Support configurable quote character parameter in Select (#8955)

This commit is contained in:
Anis Elleuch
2020-03-13 22:09:34 -07:00
committed by GitHub
parent 3ca9f5ffa3
commit 35ecc04223
18 changed files with 567 additions and 142 deletions

View File

@@ -18,8 +18,11 @@ package csv
import (
"encoding/xml"
"errors"
"fmt"
"io"
"strings"
"unicode/utf8"
)
const (
@@ -55,68 +58,64 @@ func (args *ReaderArgs) IsEmpty() bool {
}
// UnmarshalXML - decodes XML data.
func (args *ReaderArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
// Make subtype to avoid recursive UnmarshalXML().
type subReaderArgs ReaderArgs
parsedArgs := subReaderArgs{}
if err := d.DecodeElement(&parsedArgs, &start); err != nil {
return err
func (args *ReaderArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) (err error) {
args.FileHeaderInfo = none
args.RecordDelimiter = defaultRecordDelimiter
args.FieldDelimiter = defaultFieldDelimiter
args.QuoteCharacter = defaultQuoteCharacter
args.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
args.CommentCharacter = defaultCommentCharacter
args.AllowQuotedRecordDelimiter = false
for {
// Read tokens from the XML document in a stream.
t, err := d.Token()
if err != nil {
if err == io.EOF {
break
}
return err
}
switch se := t.(type) {
case xml.StartElement:
tagName := se.Name.Local
switch tagName {
case "AllowQuotedRecordDelimiter":
var b bool
if err = d.DecodeElement(&b, &se); err != nil {
return err
}
args.AllowQuotedRecordDelimiter = b
default:
var s string
if err = d.DecodeElement(&s, &se); err != nil {
return err
}
switch tagName {
case "FileHeaderInfo":
args.FileHeaderInfo = strings.ToLower(s)
case "RecordDelimiter":
args.RecordDelimiter = s
case "FieldDelimiter":
args.FieldDelimiter = s
case "QuoteCharacter":
if utf8.RuneCountInString(s) > 1 {
return fmt.Errorf("unsupported QuoteCharacter '%v'", s)
}
args.QuoteCharacter = s
// Not supported yet
case "QuoteEscapeCharacter":
case "Comments":
args.CommentCharacter = s
default:
return errors.New("unrecognized option")
}
}
}
}
parsedArgs.FileHeaderInfo = strings.ToLower(parsedArgs.FileHeaderInfo)
switch parsedArgs.FileHeaderInfo {
case "":
parsedArgs.FileHeaderInfo = none
case none, use, ignore:
default:
return errInvalidFileHeaderInfo(fmt.Errorf("invalid FileHeaderInfo '%v'", parsedArgs.FileHeaderInfo))
}
switch len([]rune(parsedArgs.RecordDelimiter)) {
case 0:
parsedArgs.RecordDelimiter = defaultRecordDelimiter
case 1, 2:
default:
return fmt.Errorf("invalid RecordDelimiter '%v'", parsedArgs.RecordDelimiter)
}
switch len([]rune(parsedArgs.FieldDelimiter)) {
case 0:
parsedArgs.FieldDelimiter = defaultFieldDelimiter
case 1:
default:
return fmt.Errorf("invalid FieldDelimiter '%v'", parsedArgs.FieldDelimiter)
}
switch parsedArgs.QuoteCharacter {
case "":
parsedArgs.QuoteCharacter = defaultQuoteCharacter
case defaultQuoteCharacter:
default:
return fmt.Errorf("unsupported QuoteCharacter '%v'", parsedArgs.QuoteCharacter)
}
switch parsedArgs.QuoteEscapeCharacter {
case "":
parsedArgs.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
case defaultQuoteEscapeCharacter:
default:
return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", parsedArgs.QuoteEscapeCharacter)
}
switch parsedArgs.CommentCharacter {
case "":
parsedArgs.CommentCharacter = defaultCommentCharacter
case defaultCommentCharacter:
default:
return fmt.Errorf("unsupported Comments '%v'", parsedArgs.CommentCharacter)
}
if parsedArgs.AllowQuotedRecordDelimiter {
return fmt.Errorf("flag AllowQuotedRecordDelimiter is unsupported at the moment")
}
*args = ReaderArgs(parsedArgs)
args.QuoteEscapeCharacter = args.QuoteCharacter
args.unmarshaled = true
return nil
}
@@ -138,55 +137,54 @@ func (args *WriterArgs) IsEmpty() bool {
// UnmarshalXML - decodes XML data.
func (args *WriterArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
// Make subtype to avoid recursive UnmarshalXML().
type subWriterArgs WriterArgs
parsedArgs := subWriterArgs{}
if err := d.DecodeElement(&parsedArgs, &start); err != nil {
return err
args.QuoteFields = asneeded
args.RecordDelimiter = defaultRecordDelimiter
args.FieldDelimiter = defaultFieldDelimiter
args.QuoteCharacter = defaultQuoteCharacter
args.QuoteEscapeCharacter = defaultQuoteCharacter
for {
// Read tokens from the XML document in a stream.
t, err := d.Token()
if err != nil {
if err == io.EOF {
break
}
return err
}
switch se := t.(type) {
case xml.StartElement:
var s string
if err = d.DecodeElement(&s, &se); err != nil {
return err
}
switch se.Name.Local {
case "QuoteFields":
args.QuoteFields = strings.ToLower(s)
case "RecordDelimiter":
args.RecordDelimiter = s
case "FieldDelimiter":
args.FieldDelimiter = s
case "QuoteCharacter":
switch utf8.RuneCountInString(s) {
case 0:
args.QuoteCharacter = "\x00"
case 1:
args.QuoteCharacter = s
default:
return fmt.Errorf("unsupported QuoteCharacter '%v'", s)
}
// Not supported yet
case "QuoteEscapeCharacter":
default:
return errors.New("unrecognized option")
}
}
}
parsedArgs.QuoteFields = strings.ToLower(parsedArgs.QuoteFields)
switch parsedArgs.QuoteFields {
case "":
parsedArgs.QuoteFields = asneeded
case always, asneeded:
default:
return errInvalidQuoteFields(fmt.Errorf("invalid QuoteFields '%v'", parsedArgs.QuoteFields))
}
switch len([]rune(parsedArgs.RecordDelimiter)) {
case 0:
parsedArgs.RecordDelimiter = defaultRecordDelimiter
case 1, 2:
default:
return fmt.Errorf("invalid RecordDelimiter '%v'", parsedArgs.RecordDelimiter)
}
switch len([]rune(parsedArgs.FieldDelimiter)) {
case 0:
parsedArgs.FieldDelimiter = defaultFieldDelimiter
case 1:
default:
return fmt.Errorf("invalid FieldDelimiter '%v'", parsedArgs.FieldDelimiter)
}
switch parsedArgs.QuoteCharacter {
case "":
parsedArgs.QuoteCharacter = defaultQuoteCharacter
case defaultQuoteCharacter:
default:
return fmt.Errorf("unsupported QuoteCharacter '%v'", parsedArgs.QuoteCharacter)
}
switch parsedArgs.QuoteEscapeCharacter {
case "":
parsedArgs.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
case defaultQuoteEscapeCharacter:
default:
return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", parsedArgs.QuoteEscapeCharacter)
}
*args = WriterArgs(parsedArgs)
args.QuoteEscapeCharacter = args.QuoteCharacter
args.unmarshaled = true
return nil
}

View File

@@ -294,6 +294,11 @@ func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
ret := csv.NewReader(r)
ret.Comma = []rune(args.FieldDelimiter)[0]
ret.Comment = []rune(args.CommentCharacter)[0]
ret.Quote = []rune{}
if len([]rune(args.QuoteCharacter)) > 0 {
// Add the first rune of args.QuoteChracter
ret.Quote = append(ret.Quote, []rune(args.QuoteCharacter)[0])
}
ret.FieldsPerRecord = -1
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.

View File

@@ -63,7 +63,7 @@ func TestRead(t *testing.T) {
if err != nil {
break
}
record.WriteCSV(&result, []rune(c.fieldDelimiter)[0])
record.WriteCSV(&result, []rune(c.fieldDelimiter)[0], '"', false)
result.Truncate(result.Len() - 1)
result.WriteString(c.recordDelimiter)
}
@@ -243,7 +243,7 @@ func TestReadExtended(t *testing.T) {
}
if fields < 10 {
// Write with fixed delimiters, newlines.
err := record.WriteCSV(&result, ',')
err := record.WriteCSV(&result, ',', '"', false)
if err != nil {
t.Error(err)
}
@@ -454,7 +454,7 @@ func TestReadFailures(t *testing.T) {
break
}
// Write with fixed delimiters, newlines.
err := record.WriteCSV(&result, ',')
err := record.WriteCSV(&result, ',', '"', false)
if err != nil {
t.Error(err)
}

View File

@@ -92,9 +92,11 @@ func (r *Record) Clone(dst sql.Record) sql.Record {
}
// WriteCSV - encodes to CSV data.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune) error {
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune, quote rune, alwaysQuote bool) error {
w := csv.NewWriter(writer)
w.Comma = fieldDelimiter
w.AlwaysQuote = alwaysQuote
w.Quote = quote
if err := w.Write(r.csvRecord); err != nil {
return err
}

View File

@@ -17,7 +17,6 @@
package json
import (
"encoding/csv"
"encoding/json"
"errors"
"fmt"
@@ -27,6 +26,7 @@ import (
"strings"
"github.com/bcicen/jstream"
csv "github.com/minio/minio/pkg/csvparser"
"github.com/minio/minio/pkg/s3select/sql"
)
@@ -108,7 +108,7 @@ func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
}
// WriteCSV - encodes to CSV data.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune) error {
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune, quote rune, alwaysQuote bool) error {
var csvRecord []string
for _, kv := range r.KVS {
var columnValue string
@@ -137,6 +137,8 @@ func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune) error {
w := csv.NewWriter(writer)
w.Comma = fieldDelimiter
w.Quote = quote
w.AlwaysQuote = alwaysQuote
if err := w.Write(csvRecord); err != nil {
return err
}

View File

@@ -353,7 +353,10 @@ func (s3Select *S3Select) marshal(buf *bytes.Buffer, record sql.Record) error {
}()
bufioWriter.Reset(buf)
err := record.WriteCSV(bufioWriter, []rune(s3Select.Output.CSVArgs.FieldDelimiter)[0])
err := record.WriteCSV(bufioWriter,
[]rune(s3Select.Output.CSVArgs.FieldDelimiter)[0],
[]rune(s3Select.Output.CSVArgs.QuoteCharacter)[0],
strings.ToLower(s3Select.Output.CSVArgs.QuoteFields) == "always")
if err != nil {
return err
}

View File

@@ -252,6 +252,7 @@ func TestJSONQueries(t *testing.T) {
</InputSerialization>
<OutputSerialization>
<CSV>
<QuoteCharacter>"</QuoteCharacter>
</CSV>
</OutputSerialization>
<RequestProgress>
@@ -587,6 +588,7 @@ func TestCSVQueries2(t *testing.T) {
<CompressionType>NONE</CompressionType>
<CSV>
<FileHeaderInfo>USE</FileHeaderInfo>
<QuoteCharacter>"</QuoteCharacter>
</CSV>
</InputSerialization>
<OutputSerialization>

View File

@@ -131,11 +131,11 @@ func TestNDJSON(t *testing.T) {
t.Error(err)
}
var gotB, wantB bytes.Buffer
err = rec.WriteCSV(&gotB, ',')
err = rec.WriteCSV(&gotB, ',', '"', false)
if err != nil {
t.Error(err)
}
err = want.WriteCSV(&wantB, ',')
err = want.WriteCSV(&wantB, ',', '"', false)
if err != nil {
t.Error(err)
}

View File

@@ -17,10 +17,11 @@
package simdj
import (
"encoding/csv"
"fmt"
"io"
csv "github.com/minio/minio/pkg/csvparser"
"github.com/bcicen/jstream"
"github.com/minio/minio/pkg/s3select/json"
"github.com/minio/minio/pkg/s3select/sql"
@@ -140,7 +141,7 @@ func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
}
// WriteCSV - encodes to CSV data.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune) error {
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter, quote rune, alwaysQuote bool) error {
csvRecord := make([]string, 0, 10)
var tmp simdjson.Iter
obj := r.object
@@ -173,6 +174,8 @@ allElems:
}
w := csv.NewWriter(writer)
w.Comma = fieldDelimiter
w.Quote = quote
w.AlwaysQuote = alwaysQuote
if err := w.Write(csvRecord); err != nil {
return err
}

View File

@@ -46,7 +46,7 @@ type Record interface {
// Set a value.
// Can return a different record type.
Set(name string, value *Value) (Record, error)
WriteCSV(writer io.Writer, fieldDelimiter rune) error
WriteCSV(writer io.Writer, fieldDelimiter, quote rune, alwaysQuote bool) error
WriteJSON(writer io.Writer) error
// Clone the record and if possible use the destination provided.