sql: Add support for a quote escape character in CSV (#9231)

This commit modifies the csv parser, a fork of the Go standard
library's csv parser, to support a custom quote escape character.

The quote escape character is used to escape the quote
character when a csv field contains the quote character
as part of its data.
This commit is contained in:
Anis Elleuch
2020-04-01 23:39:34 +01:00
committed by GitHub
parent 7de29e6e6b
commit 9902c9baaa
12 changed files with 153 additions and 64 deletions

View File

@@ -104,8 +104,15 @@ func (args *ReaderArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) (er
return fmt.Errorf("unsupported QuoteCharacter '%v'", s)
}
args.QuoteCharacter = s
// Not supported yet
// QuoteEscapeCharacter must be at most one rune long: an empty value
// selects defaultQuoteEscapeCharacter, a single rune is stored as given,
// and anything longer is rejected with an error.
case "QuoteEscapeCharacter":
// Count runes rather than bytes so a single multi-byte character is accepted.
switch utf8.RuneCountInString(s) {
case 0:
args.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
case 1:
args.QuoteEscapeCharacter = s
default:
return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", s)
}
case "Comments":
args.CommentCharacter = s
default:
@@ -115,7 +122,6 @@ func (args *ReaderArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) (er
}
}
args.QuoteEscapeCharacter = args.QuoteCharacter
args.unmarshaled = true
return nil
}
@@ -176,15 +182,21 @@ func (args *WriterArgs) UnmarshalXML(d *xml.Decoder, start xml.StartElement) err
default:
return fmt.Errorf("unsupported QuoteCharacter '%v'", s)
}
// Not supported yet
// QuoteEscapeCharacter must be at most one rune long: an empty value
// selects defaultQuoteEscapeCharacter, a single rune is stored as given,
// and anything longer is rejected with an error.
case "QuoteEscapeCharacter":
	// Count runes rather than bytes so a single multi-byte character is accepted.
	switch utf8.RuneCountInString(s) {
	case 0:
		args.QuoteEscapeCharacter = defaultQuoteEscapeCharacter
	case 1:
		args.QuoteEscapeCharacter = s
	default:
		// Report the option that was actually rejected; the message used to
		// name QuoteCharacter, which pointed users at the wrong setting.
		return fmt.Errorf("unsupported QuoteEscapeCharacter '%v'", s)
	}
default:
return errors.New("unrecognized option")
}
}
}
args.QuoteEscapeCharacter = args.QuoteCharacter
args.unmarshaled = true
return nil
}

View File

@@ -299,6 +299,7 @@ func NewReader(readCloser io.ReadCloser, args *ReaderArgs) (*Reader, error) {
// Add the first rune of args.QuoteCharacter
ret.Quote = append(ret.Quote, []rune(args.QuoteCharacter)[0])
}
ret.QuoteEscape = []rune(args.QuoteEscapeCharacter)[0]
ret.FieldsPerRecord = -1
// If LazyQuotes is true, a quote may appear in an unquoted field and a
// non-doubled quote may appear in a quoted field.

View File

@@ -63,7 +63,13 @@ func TestRead(t *testing.T) {
if err != nil {
break
}
record.WriteCSV(&result, []rune(c.fieldDelimiter)[0], '"', false)
opts := sql.WriteCSVOpts{
FieldDelimiter: []rune(c.fieldDelimiter)[0],
Quote: '"',
QuoteEscape: '"',
AlwaysQuote: false,
}
record.WriteCSV(&result, opts)
result.Truncate(result.Len() - 1)
result.WriteString(c.recordDelimiter)
}
@@ -242,8 +248,14 @@ func TestReadExtended(t *testing.T) {
break
}
if fields < 10 {
opts := sql.WriteCSVOpts{
FieldDelimiter: ',',
Quote: '"',
QuoteEscape: '"',
AlwaysQuote: false,
}
// Write with fixed delimiters, newlines.
err := record.WriteCSV(&result, ',', '"', false)
err := record.WriteCSV(&result, opts)
if err != nil {
t.Error(err)
}
@@ -453,8 +465,15 @@ func TestReadFailures(t *testing.T) {
if err != nil {
break
}
opts := sql.WriteCSVOpts{
FieldDelimiter: ',',
Quote: '"',
QuoteEscape: '"',
AlwaysQuote: false,
}
// Write with fixed delimiters, newlines.
err := record.WriteCSV(&result, ',', '"', false)
err := record.WriteCSV(&result, opts)
if err != nil {
t.Error(err)
}

View File

@@ -92,11 +92,12 @@ func (r *Record) Clone(dst sql.Record) sql.Record {
}
// WriteCSV - encodes to CSV data.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune, quote rune, alwaysQuote bool) error {
func (r *Record) WriteCSV(writer io.Writer, opts sql.WriteCSVOpts) error {
w := csv.NewWriter(writer)
w.Comma = fieldDelimiter
w.AlwaysQuote = alwaysQuote
w.Quote = quote
w.Comma = opts.FieldDelimiter
w.AlwaysQuote = opts.AlwaysQuote
w.Quote = opts.Quote
w.QuoteEscape = opts.QuoteEscape
if err := w.Write(r.csvRecord); err != nil {
return err
}

View File

@@ -108,7 +108,7 @@ func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
}
// WriteCSV - encodes to CSV data.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune, quote rune, alwaysQuote bool) error {
func (r *Record) WriteCSV(writer io.Writer, opts sql.WriteCSVOpts) error {
var csvRecord []string
for _, kv := range r.KVS {
var columnValue string
@@ -136,9 +136,10 @@ func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune, quote rune, alw
}
w := csv.NewWriter(writer)
w.Comma = fieldDelimiter
w.Quote = quote
w.AlwaysQuote = alwaysQuote
w.Comma = opts.FieldDelimiter
w.Quote = opts.Quote
w.AlwaysQuote = opts.AlwaysQuote
w.QuoteEscape = opts.QuoteEscape
if err := w.Write(csvRecord); err != nil {
return err
}

View File

@@ -353,10 +353,13 @@ func (s3Select *S3Select) marshal(buf *bytes.Buffer, record sql.Record) error {
}()
bufioWriter.Reset(buf)
err := record.WriteCSV(bufioWriter,
[]rune(s3Select.Output.CSVArgs.FieldDelimiter)[0],
[]rune(s3Select.Output.CSVArgs.QuoteCharacter)[0],
strings.ToLower(s3Select.Output.CSVArgs.QuoteFields) == "always")
opts := sql.WriteCSVOpts{
FieldDelimiter: []rune(s3Select.Output.CSVArgs.FieldDelimiter)[0],
Quote: []rune(s3Select.Output.CSVArgs.QuoteCharacter)[0],
QuoteEscape: []rune(s3Select.Output.CSVArgs.QuoteEscapeCharacter)[0],
AlwaysQuote: strings.ToLower(s3Select.Output.CSVArgs.QuoteFields) == "always",
}
err := record.WriteCSV(bufioWriter, opts)
if err != nil {
return err
}

View File

@@ -25,6 +25,7 @@ import (
"github.com/klauspost/compress/zstd"
"github.com/minio/minio/pkg/s3select/json"
"github.com/minio/minio/pkg/s3select/sql"
"github.com/minio/simdjson-go"
)
@@ -131,11 +132,17 @@ func TestNDJSON(t *testing.T) {
t.Error(err)
}
var gotB, wantB bytes.Buffer
err = rec.WriteCSV(&gotB, ',', '"', false)
opts := sql.WriteCSVOpts{
FieldDelimiter: ',',
Quote: '"',
QuoteEscape: '"',
AlwaysQuote: false,
}
err = rec.WriteCSV(&gotB, opts)
if err != nil {
t.Error(err)
}
err = want.WriteCSV(&wantB, ',', '"', false)
err = want.WriteCSV(&wantB, opts)
if err != nil {
t.Error(err)
}

View File

@@ -141,7 +141,7 @@ func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
}
// WriteCSV - encodes to CSV data.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter, quote rune, alwaysQuote bool) error {
func (r *Record) WriteCSV(writer io.Writer, opts sql.WriteCSVOpts) error {
csvRecord := make([]string, 0, 10)
var tmp simdjson.Iter
obj := r.object
@@ -173,9 +173,10 @@ allElems:
csvRecord = append(csvRecord, columnValue)
}
w := csv.NewWriter(writer)
w.Comma = fieldDelimiter
w.Quote = quote
w.AlwaysQuote = alwaysQuote
w.Comma = opts.FieldDelimiter
w.Quote = opts.Quote
w.QuoteEscape = opts.QuoteEscape
w.AlwaysQuote = opts.AlwaysQuote
if err := w.Write(csvRecord); err != nil {
return err
}

View File

@@ -39,6 +39,14 @@ const (
SelectFmtParquet
)
// WriteCSVOpts - encapsulates options for Select CSV output.
//
// Record.WriteCSV implementations hand these values to the CSV writer:
// FieldDelimiter becomes the writer's Comma, while Quote, QuoteEscape and
// AlwaysQuote are assigned to the writer fields of the same name.
type WriteCSVOpts struct {
	// FieldDelimiter is the rune placed between fields, Quote is the quote
	// rune, and QuoteEscape is the rune used to escape a Quote that is part
	// of a field's data.
	FieldDelimiter, Quote, QuoteEscape rune

	// AlwaysQuote is forwarded unchanged to the writer's AlwaysQuote setting.
	AlwaysQuote bool
}
// Record - is a type containing columns and their values.
type Record interface {
Get(name string) (*Value, error)
@@ -46,7 +54,7 @@ type Record interface {
// Set a value.
// Can return a different record type.
Set(name string, value *Value) (Record, error)
WriteCSV(writer io.Writer, fieldDelimiter, quote rune, alwaysQuote bool) error
WriteCSV(writer io.Writer, opts WriteCSVOpts) error
WriteJSON(writer io.Writer) error
// Clone the record and if possible use the destination provided.