mirror of
https://github.com/minio/minio.git
synced 2024-12-24 06:05:55 -05:00
Import CSV parser library (#8927)
The CSV library code is imported from Go 1.13.6
This commit is contained in:
parent
15e2ea2c96
commit
de924605a1
131
pkg/csvparser/example_test.go
Normal file
131
pkg/csvparser/example_test.go
Normal file
@ -0,0 +1,131 @@
|
||||
// Copyright 2015 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in https://golang.org/LICENSE
|
||||
|
||||
package csv_test
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ExampleReader parses a small CSV document one record at a time and prints
// every record as soon as it has been read.
func ExampleReader() {
	const in = `first_name,last_name,username
"Rob","Pike",rob
Ken,Thompson,ken
"Robert","Griesemer","gri"
`
	r := csv.NewReader(strings.NewReader(in))

	// Read reports io.EOF once the input is exhausted; any other error is fatal.
	for record, err := r.Read(); err != io.EOF; record, err = r.Read() {
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(record)
	}
	// Output:
	// [first_name last_name username]
	// [Rob Pike rob]
	// [Ken Thompson ken]
	// [Robert Griesemer gri]
}
|
||||
|
||||
// This example shows how csv.Reader can be configured to handle other
// types of CSV files: here fields are separated by semicolons and lines
// starting with '#' are treated as comments.
func ExampleReader_options() {
	const in = `first_name;last_name;username
"Rob";"Pike";rob
# lines beginning with a # character are ignored
Ken;Thompson;ken
"Robert";"Griesemer";"gri"
`
	r := csv.NewReader(strings.NewReader(in))
	r.Comma, r.Comment = ';', '#'

	records, err := r.ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Print(records)
	// Output:
	// [[first_name last_name username] [Rob Pike rob] [Ken Thompson ken] [Robert Griesemer gri]]
}
|
||||
|
||||
// ExampleReader_ReadAll parses a whole CSV document with a single ReadAll
// call and prints the resulting slice of records.
func ExampleReader_ReadAll() {
	const in = `first_name,last_name,username
"Rob","Pike",rob
Ken,Thompson,ken
"Robert","Griesemer","gri"
`
	records, err := csv.NewReader(strings.NewReader(in)).ReadAll()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Print(records)
	// Output:
	// [[first_name last_name username] [Rob Pike rob] [Ken Thompson ken] [Robert Griesemer gri]]
}
|
||||
|
||||
// ExampleWriter writes a handful of records to standard output one at a
// time, then flushes the writer and checks for a deferred write error.
func ExampleWriter() {
	w := csv.NewWriter(os.Stdout)

	rows := [][]string{
		{"first_name", "last_name", "username"},
		{"Rob", "Pike", "rob"},
		{"Ken", "Thompson", "ken"},
		{"Robert", "Griesemer", "gri"},
	}
	for i := range rows {
		if err := w.Write(rows[i]); err != nil {
			log.Fatalln("error writing record to csv:", err)
		}
	}

	// Records are buffered; push them out to the underlying writer
	// (standard output) before checking for errors.
	w.Flush()
	if err := w.Error(); err != nil {
		log.Fatal(err)
	}
	// Output:
	// first_name,last_name,username
	// Rob,Pike,rob
	// Ken,Thompson,ken
	// Robert,Griesemer,gri
}
|
||||
|
||||
// ExampleWriter_WriteAll writes a complete set of records to standard output
// with one WriteAll call and then checks the writer for an error.
func ExampleWriter_WriteAll() {
	rows := [][]string{
		{"first_name", "last_name", "username"},
		{"Rob", "Pike", "rob"},
		{"Ken", "Thompson", "ken"},
		{"Robert", "Griesemer", "gri"},
	}

	w := csv.NewWriter(os.Stdout)
	// WriteAll calls Flush internally. Its result is deliberately discarded
	// here: the example demonstrates checking w.Error afterwards instead.
	_ = w.WriteAll(rows)

	if err := w.Error(); err != nil {
		log.Fatalln("error writing csv:", err)
	}
	// Output:
	// first_name,last_name,username
	// Rob,Pike,rob
	// Ken,Thompson,ken
	// Robert,Griesemer,gri
}
|
70
pkg/csvparser/fuzz.go
Normal file
70
pkg/csvparser/fuzz.go
Normal file
@ -0,0 +1,70 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in https://golang.org/LICENSE
|
||||
|
||||
// +build gofuzz
|
||||
|
||||
package csv
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"reflect"
|
||||
)
|
||||
|
||||
func Fuzz(data []byte) int {
|
||||
score := 0
|
||||
buf := new(bytes.Buffer)
|
||||
|
||||
for _, tt := range []Reader{
|
||||
{},
|
||||
{Comma: ';'},
|
||||
{Comma: '\t'},
|
||||
{LazyQuotes: true},
|
||||
{TrimLeadingSpace: true},
|
||||
{Comment: '#'},
|
||||
{Comment: ';'},
|
||||
} {
|
||||
r := NewReader(bytes.NewReader(data))
|
||||
r.Comma = tt.Comma
|
||||
r.Comment = tt.Comment
|
||||
r.LazyQuotes = tt.LazyQuotes
|
||||
r.TrimLeadingSpace = tt.TrimLeadingSpace
|
||||
|
||||
records, err := r.ReadAll()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
score = 1
|
||||
|
||||
buf.Reset()
|
||||
w := NewWriter(buf)
|
||||
w.Comma = tt.Comma
|
||||
err = w.WriteAll(records)
|
||||
if err != nil {
|
||||
fmt.Printf("writer = %#v\n", w)
|
||||
fmt.Printf("records = %v\n", records)
|
||||
panic(err)
|
||||
}
|
||||
|
||||
r = NewReader(buf)
|
||||
r.Comma = tt.Comma
|
||||
r.Comment = tt.Comment
|
||||
r.LazyQuotes = tt.LazyQuotes
|
||||
r.TrimLeadingSpace = tt.TrimLeadingSpace
|
||||
result, err := r.ReadAll()
|
||||
if err != nil {
|
||||
fmt.Printf("reader = %#v\n", r)
|
||||
fmt.Printf("records = %v\n", records)
|
||||
panic(err)
|
||||
}
|
||||
|
||||
if !reflect.DeepEqual(records, result) {
|
||||
fmt.Println("records = \n", records)
|
||||
fmt.Println("result = \n", records)
|
||||
panic("not equal")
|
||||
}
|
||||
}
|
||||
|
||||
return score
|
||||
}
|
402
pkg/csvparser/reader.go
Normal file
402
pkg/csvparser/reader.go
Normal file
@ -0,0 +1,402 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in https://golang.org/LICENSE
|
||||
|
||||
// Package csv reads and writes comma-separated values (CSV) files.
|
||||
// There are many kinds of CSV files; this package supports the format
|
||||
// described in RFC 4180.
|
||||
//
|
||||
// A csv file contains zero or more records of one or more fields per record.
|
||||
// Each record is separated by the newline character. The final record may
|
||||
// optionally be followed by a newline character.
|
||||
//
|
||||
// field1,field2,field3
|
||||
//
|
||||
// White space is considered part of a field.
|
||||
//
|
||||
// Carriage returns before newline characters are silently removed.
|
||||
//
|
||||
// Blank lines are ignored. A line with only whitespace characters (excluding
|
||||
// the ending newline character) is not considered a blank line.
|
||||
//
|
||||
// Fields which start and stop with the quote character " are called
|
||||
// quoted-fields. The beginning and ending quote are not part of the
|
||||
// field.
|
||||
//
|
||||
// The source:
|
||||
//
|
||||
// normal string,"quoted-field"
|
||||
//
|
||||
// results in the fields
|
||||
//
|
||||
// {`normal string`, `quoted-field`}
|
||||
//
|
||||
// Within a quoted-field a quote character followed by a second quote
|
||||
// character is considered a single quote.
|
||||
//
|
||||
// "the ""word"" is true","a ""quoted-field"""
|
||||
//
|
||||
// results in
|
||||
//
|
||||
// {`the "word" is true`, `a "quoted-field"`}
|
||||
//
|
||||
// Newlines and commas may be included in a quoted-field
|
||||
//
|
||||
// "Multi-line
|
||||
// field","comma is ,"
|
||||
//
|
||||
// results in
|
||||
//
|
||||
// {`Multi-line
|
||||
// field`, `comma is ,`}
|
||||
package csv
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// A ParseError is returned for parsing errors.
|
||||
// Line numbers are 1-indexed and columns are 0-indexed.
|
||||
type ParseError struct {
|
||||
StartLine int // Line where the record starts
|
||||
Line int // Line where the error occurred
|
||||
Column int // Column (rune index) where the error occurred
|
||||
Err error // The actual error
|
||||
}
|
||||
|
||||
func (e *ParseError) Error() string {
|
||||
if e.Err == ErrFieldCount {
|
||||
return fmt.Sprintf("record on line %d: %v", e.Line, e.Err)
|
||||
}
|
||||
if e.StartLine != e.Line {
|
||||
return fmt.Sprintf("record on line %d; parse error on line %d, column %d: %v", e.StartLine, e.Line, e.Column, e.Err)
|
||||
}
|
||||
return fmt.Sprintf("parse error on line %d, column %d: %v", e.Line, e.Column, e.Err)
|
||||
}
|
||||
|
||||
// Unwrap returns the underlying error
|
||||
func (e *ParseError) Unwrap() error { return e.Err }
|
||||
|
||||
// These are the errors that can be returned in ParseError.Err.
var (
	// ErrTrailingComma reported an extra delimiter at the end of a line.
	//
	// Deprecated: No longer used.
	ErrTrailingComma = errors.New("extra delimiter at end of line")

	// ErrBareQuote reports a quote character inside a non-quoted field.
	ErrBareQuote = errors.New("bare \" in non-quoted-field")

	// ErrQuote reports an extraneous or missing quote in a quoted field.
	ErrQuote = errors.New("extraneous or missing \" in quoted-field")

	// ErrFieldCount reports a record with an unexpected number of fields.
	ErrFieldCount = errors.New("wrong number of fields")
)

// errInvalidDelim is returned by readRecord when Comma or Comment is not a
// usable delimiter, or when the two are equal.
var errInvalidDelim = errors.New("csv: invalid field or comment delimiter")
|
||||
|
||||
// validDelim reports whether r may be used as a field or comment delimiter.
// NUL, the quote character, carriage return, line feed and the Unicode
// replacement character are rejected, as is anything that is not a valid rune.
func validDelim(r rune) bool {
	switch r {
	case 0, '"', '\r', '\n', utf8.RuneError:
		return false
	}
	return utf8.ValidRune(r)
}
|
||||
|
||||
// A Reader parses records out of CSV-encoded input.
//
// A Reader created by NewReader expects input conforming to RFC 4180.
// Its exported fields may be changed to customize parsing, but only before
// the first call to Read or ReadAll.
//
// Every \r\n sequence in the input, including those inside multiline field
// values, is converted to a plain \n, so the returned data does not depend
// on the line-ending convention of the input.
type Reader struct {
	// Comma is the field delimiter; NewReader sets it to ','.
	// It must be a valid rune and must not be \r, \n,
	// or the Unicode replacement character (0xFFFD).
	Comma rune

	// Comment, when non-zero, is the comment character. A line that begins
	// with it, with no whitespace in front, is ignored. When whitespace
	// precedes it, the character is part of the field instead, even if
	// TrimLeadingSpace is true.
	// Comment must be a valid rune, must not be \r, \n, or the Unicode
	// replacement character (0xFFFD), and must differ from Comma.
	Comment rune

	// FieldsPerRecord is the expected number of fields in each record.
	// A positive value makes Read require exactly that many fields.
	// Zero makes Read adopt the field count of the first record, so that
	// later records must match it. A negative value disables the check,
	// allowing records with varying numbers of fields.
	FieldsPerRecord int

	// LazyQuotes, when true, allows a quote inside an unquoted field and
	// a non-doubled quote inside a quoted field.
	LazyQuotes bool

	// TrimLeadingSpace, when true, drops leading white space from each
	// field. This happens even when Comma itself is white space.
	TrimLeadingSpace bool

	// ReuseRecord lets Read return a slice that shares its backing array
	// with the slice returned by the previous call, for performance.
	// When false (the default) every call returns freshly allocated
	// memory owned by the caller.
	ReuseRecord bool

	// Deprecated: No longer used.
	TrailingComma bool

	// r is the buffered source of input.
	r *bufio.Reader

	// numLine is the number of the line currently being read.
	numLine int

	// rawBuffer is scratch space for lines; only readLine touches it.
	rawBuffer []byte

	// recordBuffer stores the unescaped fields of the current record back
	// to back; fieldIndexes says where each one ends.
	// E.g., for the row `a,"b","c""d",e`, recordBuffer holds `abc"de`
	// and fieldIndexes holds [1, 2, 5, 6].
	recordBuffer []byte

	// fieldIndexes[i] is the offset in recordBuffer at which the i'th
	// field ends.
	fieldIndexes []int

	// lastRecord caches the previously returned record; it is used only
	// when ReuseRecord is true.
	lastRecord []string
}

// NewReader returns a new Reader that reads from r, with Comma set to ','.
func NewReader(r io.Reader) *Reader {
	rd := new(Reader)
	rd.Comma = ','
	rd.r = bufio.NewReader(r)
	return rd
}
|
||||
|
||||
// Read reads one record (a slice of fields) from r.
|
||||
// If the record has an unexpected number of fields,
|
||||
// Read returns the record along with the error ErrFieldCount.
|
||||
// Except for that case, Read always returns either a non-nil
|
||||
// record or a non-nil error, but not both.
|
||||
// If there is no data left to be read, Read returns nil, io.EOF.
|
||||
// If ReuseRecord is true, the returned slice may be shared
|
||||
// between multiple calls to Read.
|
||||
func (r *Reader) Read() (record []string, err error) {
|
||||
if r.ReuseRecord {
|
||||
record, err = r.readRecord(r.lastRecord)
|
||||
r.lastRecord = record
|
||||
} else {
|
||||
record, err = r.readRecord(nil)
|
||||
}
|
||||
return record, err
|
||||
}
|
||||
|
||||
// ReadAll reads all the remaining records from r.
|
||||
// Each record is a slice of fields.
|
||||
// A successful call returns err == nil, not err == io.EOF. Because ReadAll is
|
||||
// defined to read until EOF, it does not treat end of file as an error to be
|
||||
// reported.
|
||||
func (r *Reader) ReadAll() (records [][]string, err error) {
|
||||
for {
|
||||
record, err := r.readRecord(nil)
|
||||
if err == io.EOF {
|
||||
return records, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
records = append(records, record)
|
||||
}
|
||||
}
|
||||
|
||||
// readLine reads the next line (with the trailing endline).
|
||||
// If EOF is hit without a trailing endline, it will be omitted.
|
||||
// If some bytes were read, then the error is never io.EOF.
|
||||
// The result is only valid until the next call to readLine.
|
||||
func (r *Reader) readLine() ([]byte, error) {
|
||||
line, err := r.r.ReadSlice('\n')
|
||||
if err == bufio.ErrBufferFull {
|
||||
r.rawBuffer = append(r.rawBuffer[:0], line...)
|
||||
for err == bufio.ErrBufferFull {
|
||||
line, err = r.r.ReadSlice('\n')
|
||||
r.rawBuffer = append(r.rawBuffer, line...)
|
||||
}
|
||||
line = r.rawBuffer
|
||||
}
|
||||
if len(line) > 0 && err == io.EOF {
|
||||
err = nil
|
||||
// For backwards compatibility, drop trailing \r before EOF.
|
||||
if line[len(line)-1] == '\r' {
|
||||
line = line[:len(line)-1]
|
||||
}
|
||||
}
|
||||
r.numLine++
|
||||
// Normalize \r\n to \n on all input lines.
|
||||
if n := len(line); n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
|
||||
line[n-2] = '\n'
|
||||
line = line[:n-1]
|
||||
}
|
||||
return line, err
|
||||
}
|
||||
|
||||
// lengthNL reports the number of bytes taken by a trailing \n in b:
// 1 if b ends in a newline, 0 otherwise (including when b is empty).
func lengthNL(b []byte) int {
	if n := len(b); n == 0 || b[n-1] != '\n' {
		return 0
	}
	return 1
}
|
||||
|
||||
// nextRune decodes and returns the first rune of b. It returns
// utf8.RuneError when b is empty or does not start with valid UTF-8.
func nextRune(b []byte) rune {
	first, _ := utf8.DecodeRune(b)
	return first
}
|
||||
|
||||
func (r *Reader) readRecord(dst []string) ([]string, error) {
|
||||
if r.Comma == r.Comment || !validDelim(r.Comma) || (r.Comment != 0 && !validDelim(r.Comment)) {
|
||||
return nil, errInvalidDelim
|
||||
}
|
||||
|
||||
// Read line (automatically skipping past empty lines and any comments).
|
||||
var line, fullLine []byte
|
||||
var errRead error
|
||||
for errRead == nil {
|
||||
line, errRead = r.readLine()
|
||||
if r.Comment != 0 && nextRune(line) == r.Comment {
|
||||
line = nil
|
||||
continue // Skip comment lines
|
||||
}
|
||||
if errRead == nil && len(line) == lengthNL(line) {
|
||||
line = nil
|
||||
continue // Skip empty lines
|
||||
}
|
||||
fullLine = line
|
||||
break
|
||||
}
|
||||
if errRead == io.EOF {
|
||||
return nil, errRead
|
||||
}
|
||||
|
||||
// Parse each field in the record.
|
||||
var err error
|
||||
const quoteLen = len(`"`)
|
||||
commaLen := utf8.RuneLen(r.Comma)
|
||||
recLine := r.numLine // Starting line for record
|
||||
r.recordBuffer = r.recordBuffer[:0]
|
||||
r.fieldIndexes = r.fieldIndexes[:0]
|
||||
parseField:
|
||||
for {
|
||||
if r.TrimLeadingSpace {
|
||||
line = bytes.TrimLeftFunc(line, unicode.IsSpace)
|
||||
}
|
||||
if len(line) == 0 || line[0] != '"' {
|
||||
// Non-quoted string field
|
||||
i := bytes.IndexRune(line, r.Comma)
|
||||
field := line
|
||||
if i >= 0 {
|
||||
field = field[:i]
|
||||
} else {
|
||||
field = field[:len(field)-lengthNL(field)]
|
||||
}
|
||||
// Check to make sure a quote does not appear in field.
|
||||
if !r.LazyQuotes {
|
||||
if j := bytes.IndexByte(field, '"'); j >= 0 {
|
||||
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line[j:])])
|
||||
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrBareQuote}
|
||||
break parseField
|
||||
}
|
||||
}
|
||||
r.recordBuffer = append(r.recordBuffer, field...)
|
||||
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
|
||||
if i >= 0 {
|
||||
line = line[i+commaLen:]
|
||||
continue parseField
|
||||
}
|
||||
break parseField
|
||||
} else {
|
||||
// Quoted string field
|
||||
line = line[quoteLen:]
|
||||
for {
|
||||
i := bytes.IndexByte(line, '"')
|
||||
if i >= 0 {
|
||||
// Hit next quote.
|
||||
r.recordBuffer = append(r.recordBuffer, line[:i]...)
|
||||
line = line[i+quoteLen:]
|
||||
switch rn := nextRune(line); {
|
||||
case rn == '"':
|
||||
// `""` sequence (append quote).
|
||||
r.recordBuffer = append(r.recordBuffer, '"')
|
||||
line = line[quoteLen:]
|
||||
case rn == r.Comma:
|
||||
// `",` sequence (end of field).
|
||||
line = line[commaLen:]
|
||||
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
|
||||
continue parseField
|
||||
case lengthNL(line) == len(line):
|
||||
// `"\n` sequence (end of line).
|
||||
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
|
||||
break parseField
|
||||
case r.LazyQuotes:
|
||||
// `"` sequence (bare quote).
|
||||
r.recordBuffer = append(r.recordBuffer, '"')
|
||||
default:
|
||||
// `"*` sequence (invalid non-escaped quote).
|
||||
col := utf8.RuneCount(fullLine[:len(fullLine)-len(line)-quoteLen])
|
||||
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
|
||||
break parseField
|
||||
}
|
||||
} else if len(line) > 0 {
|
||||
// Hit end of line (copy all data so far).
|
||||
r.recordBuffer = append(r.recordBuffer, line...)
|
||||
if errRead != nil {
|
||||
break parseField
|
||||
}
|
||||
line, errRead = r.readLine()
|
||||
if errRead == io.EOF {
|
||||
errRead = nil
|
||||
}
|
||||
fullLine = line
|
||||
} else {
|
||||
// Abrupt end of file (EOF or error).
|
||||
if !r.LazyQuotes && errRead == nil {
|
||||
col := utf8.RuneCount(fullLine)
|
||||
err = &ParseError{StartLine: recLine, Line: r.numLine, Column: col, Err: ErrQuote}
|
||||
break parseField
|
||||
}
|
||||
r.fieldIndexes = append(r.fieldIndexes, len(r.recordBuffer))
|
||||
break parseField
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if err == nil {
|
||||
err = errRead
|
||||
}
|
||||
|
||||
// Create a single string and create slices out of it.
|
||||
// This pins the memory of the fields together, but allocates once.
|
||||
str := string(r.recordBuffer) // Convert to string once to batch allocations
|
||||
dst = dst[:0]
|
||||
if cap(dst) < len(r.fieldIndexes) {
|
||||
dst = make([]string, len(r.fieldIndexes))
|
||||
}
|
||||
dst = dst[:len(r.fieldIndexes)]
|
||||
var preIdx int
|
||||
for i, idx := range r.fieldIndexes {
|
||||
dst[i] = str[preIdx:idx]
|
||||
preIdx = idx
|
||||
}
|
||||
|
||||
// Check or update the expected fields per record.
|
||||
if r.FieldsPerRecord > 0 {
|
||||
if len(dst) != r.FieldsPerRecord && err == nil {
|
||||
err = &ParseError{StartLine: recLine, Line: recLine, Err: ErrFieldCount}
|
||||
}
|
||||
} else if r.FieldsPerRecord == 0 {
|
||||
r.FieldsPerRecord = len(dst)
|
||||
}
|
||||
return dst, err
|
||||
}
|
509
pkg/csvparser/reader_test.go
Normal file
509
pkg/csvparser/reader_test.go
Normal file
@ -0,0 +1,509 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in https://golang.org/LICENSE
|
||||
|
||||
package csv
|
||||
|
||||
import (
|
||||
"io"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func TestRead(t *testing.T) {
|
||||
tests := []struct {
|
||||
Name string
|
||||
Input string
|
||||
Output [][]string
|
||||
Error error
|
||||
|
||||
// These fields are copied into the Reader
|
||||
Comma rune
|
||||
Comment rune
|
||||
UseFieldsPerRecord bool // false (default) means FieldsPerRecord is -1
|
||||
FieldsPerRecord int
|
||||
LazyQuotes bool
|
||||
TrimLeadingSpace bool
|
||||
ReuseRecord bool
|
||||
}{{
|
||||
Name: "Simple",
|
||||
Input: "a,b,c\n",
|
||||
Output: [][]string{{"a", "b", "c"}},
|
||||
}, {
|
||||
Name: "CRLF",
|
||||
Input: "a,b\r\nc,d\r\n",
|
||||
Output: [][]string{{"a", "b"}, {"c", "d"}},
|
||||
}, {
|
||||
Name: "BareCR",
|
||||
Input: "a,b\rc,d\r\n",
|
||||
Output: [][]string{{"a", "b\rc", "d"}},
|
||||
}, {
|
||||
Name: "RFC4180test",
|
||||
Input: `#field1,field2,field3
|
||||
"aaa","bb
|
||||
b","ccc"
|
||||
"a,a","b""bb","ccc"
|
||||
zzz,yyy,xxx
|
||||
`,
|
||||
Output: [][]string{
|
||||
{"#field1", "field2", "field3"},
|
||||
{"aaa", "bb\nb", "ccc"},
|
||||
{"a,a", `b"bb`, "ccc"},
|
||||
{"zzz", "yyy", "xxx"},
|
||||
},
|
||||
UseFieldsPerRecord: true,
|
||||
FieldsPerRecord: 0,
|
||||
}, {
|
||||
Name: "NoEOLTest",
|
||||
Input: "a,b,c",
|
||||
Output: [][]string{{"a", "b", "c"}},
|
||||
}, {
|
||||
Name: "Semicolon",
|
||||
Input: "a;b;c\n",
|
||||
Output: [][]string{{"a", "b", "c"}},
|
||||
Comma: ';',
|
||||
}, {
|
||||
Name: "MultiLine",
|
||||
Input: `"two
|
||||
line","one line","three
|
||||
line
|
||||
field"`,
|
||||
Output: [][]string{{"two\nline", "one line", "three\nline\nfield"}},
|
||||
}, {
|
||||
Name: "BlankLine",
|
||||
Input: "a,b,c\n\nd,e,f\n\n",
|
||||
Output: [][]string{
|
||||
{"a", "b", "c"},
|
||||
{"d", "e", "f"},
|
||||
},
|
||||
}, {
|
||||
Name: "BlankLineFieldCount",
|
||||
Input: "a,b,c\n\nd,e,f\n\n",
|
||||
Output: [][]string{
|
||||
{"a", "b", "c"},
|
||||
{"d", "e", "f"},
|
||||
},
|
||||
UseFieldsPerRecord: true,
|
||||
FieldsPerRecord: 0,
|
||||
}, {
|
||||
Name: "TrimSpace",
|
||||
Input: " a, b, c\n",
|
||||
Output: [][]string{{"a", "b", "c"}},
|
||||
TrimLeadingSpace: true,
|
||||
}, {
|
||||
Name: "LeadingSpace",
|
||||
Input: " a, b, c\n",
|
||||
Output: [][]string{{" a", " b", " c"}},
|
||||
}, {
|
||||
Name: "Comment",
|
||||
Input: "#1,2,3\na,b,c\n#comment",
|
||||
Output: [][]string{{"a", "b", "c"}},
|
||||
Comment: '#',
|
||||
}, {
|
||||
Name: "NoComment",
|
||||
Input: "#1,2,3\na,b,c",
|
||||
Output: [][]string{{"#1", "2", "3"}, {"a", "b", "c"}},
|
||||
}, {
|
||||
Name: "LazyQuotes",
|
||||
Input: `a "word","1"2",a","b`,
|
||||
Output: [][]string{{`a "word"`, `1"2`, `a"`, `b`}},
|
||||
LazyQuotes: true,
|
||||
}, {
|
||||
Name: "BareQuotes",
|
||||
Input: `a "word","1"2",a"`,
|
||||
Output: [][]string{{`a "word"`, `1"2`, `a"`}},
|
||||
LazyQuotes: true,
|
||||
}, {
|
||||
Name: "BareDoubleQuotes",
|
||||
Input: `a""b,c`,
|
||||
Output: [][]string{{`a""b`, `c`}},
|
||||
LazyQuotes: true,
|
||||
}, {
|
||||
Name: "BadDoubleQuotes",
|
||||
Input: `a""b,c`,
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Column: 1, Err: ErrBareQuote},
|
||||
}, {
|
||||
Name: "TrimQuote",
|
||||
Input: ` "a"," b",c`,
|
||||
Output: [][]string{{"a", " b", "c"}},
|
||||
TrimLeadingSpace: true,
|
||||
}, {
|
||||
Name: "BadBareQuote",
|
||||
Input: `a "word","b"`,
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Column: 2, Err: ErrBareQuote},
|
||||
}, {
|
||||
Name: "BadTrailingQuote",
|
||||
Input: `"a word",b"`,
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Column: 10, Err: ErrBareQuote},
|
||||
}, {
|
||||
Name: "ExtraneousQuote",
|
||||
Input: `"a "word","b"`,
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Column: 3, Err: ErrQuote},
|
||||
}, {
|
||||
Name: "BadFieldCount",
|
||||
Input: "a,b,c\nd,e",
|
||||
Error: &ParseError{StartLine: 2, Line: 2, Err: ErrFieldCount},
|
||||
UseFieldsPerRecord: true,
|
||||
FieldsPerRecord: 0,
|
||||
}, {
|
||||
Name: "BadFieldCount1",
|
||||
Input: `a,b,c`,
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Err: ErrFieldCount},
|
||||
UseFieldsPerRecord: true,
|
||||
FieldsPerRecord: 2,
|
||||
}, {
|
||||
Name: "FieldCount",
|
||||
Input: "a,b,c\nd,e",
|
||||
Output: [][]string{{"a", "b", "c"}, {"d", "e"}},
|
||||
}, {
|
||||
Name: "TrailingCommaEOF",
|
||||
Input: "a,b,c,",
|
||||
Output: [][]string{{"a", "b", "c", ""}},
|
||||
}, {
|
||||
Name: "TrailingCommaEOL",
|
||||
Input: "a,b,c,\n",
|
||||
Output: [][]string{{"a", "b", "c", ""}},
|
||||
}, {
|
||||
Name: "TrailingCommaSpaceEOF",
|
||||
Input: "a,b,c, ",
|
||||
Output: [][]string{{"a", "b", "c", ""}},
|
||||
TrimLeadingSpace: true,
|
||||
}, {
|
||||
Name: "TrailingCommaSpaceEOL",
|
||||
Input: "a,b,c, \n",
|
||||
Output: [][]string{{"a", "b", "c", ""}},
|
||||
TrimLeadingSpace: true,
|
||||
}, {
|
||||
Name: "TrailingCommaLine3",
|
||||
Input: "a,b,c\nd,e,f\ng,hi,",
|
||||
Output: [][]string{{"a", "b", "c"}, {"d", "e", "f"}, {"g", "hi", ""}},
|
||||
TrimLeadingSpace: true,
|
||||
}, {
|
||||
Name: "NotTrailingComma3",
|
||||
Input: "a,b,c, \n",
|
||||
Output: [][]string{{"a", "b", "c", " "}},
|
||||
}, {
|
||||
Name: "CommaFieldTest",
|
||||
Input: `x,y,z,w
|
||||
x,y,z,
|
||||
x,y,,
|
||||
x,,,
|
||||
,,,
|
||||
"x","y","z","w"
|
||||
"x","y","z",""
|
||||
"x","y","",""
|
||||
"x","","",""
|
||||
"","","",""
|
||||
`,
|
||||
Output: [][]string{
|
||||
{"x", "y", "z", "w"},
|
||||
{"x", "y", "z", ""},
|
||||
{"x", "y", "", ""},
|
||||
{"x", "", "", ""},
|
||||
{"", "", "", ""},
|
||||
{"x", "y", "z", "w"},
|
||||
{"x", "y", "z", ""},
|
||||
{"x", "y", "", ""},
|
||||
{"x", "", "", ""},
|
||||
{"", "", "", ""},
|
||||
},
|
||||
}, {
|
||||
Name: "TrailingCommaIneffective1",
|
||||
Input: "a,b,\nc,d,e",
|
||||
Output: [][]string{
|
||||
{"a", "b", ""},
|
||||
{"c", "d", "e"},
|
||||
},
|
||||
TrimLeadingSpace: true,
|
||||
}, {
|
||||
Name: "ReadAllReuseRecord",
|
||||
Input: "a,b\nc,d",
|
||||
Output: [][]string{
|
||||
{"a", "b"},
|
||||
{"c", "d"},
|
||||
},
|
||||
ReuseRecord: true,
|
||||
}, {
|
||||
Name: "StartLine1", // Issue 19019
|
||||
Input: "a,\"b\nc\"d,e",
|
||||
Error: &ParseError{StartLine: 1, Line: 2, Column: 1, Err: ErrQuote},
|
||||
}, {
|
||||
Name: "StartLine2",
|
||||
Input: "a,b\n\"d\n\n,e",
|
||||
Error: &ParseError{StartLine: 2, Line: 5, Column: 0, Err: ErrQuote},
|
||||
}, {
|
||||
Name: "CRLFInQuotedField", // Issue 21201
|
||||
Input: "A,\"Hello\r\nHi\",B\r\n",
|
||||
Output: [][]string{
|
||||
{"A", "Hello\nHi", "B"},
|
||||
},
|
||||
}, {
|
||||
Name: "BinaryBlobField", // Issue 19410
|
||||
Input: "x09\x41\xb4\x1c,aktau",
|
||||
Output: [][]string{{"x09A\xb4\x1c", "aktau"}},
|
||||
}, {
|
||||
Name: "TrailingCR",
|
||||
Input: "field1,field2\r",
|
||||
Output: [][]string{{"field1", "field2"}},
|
||||
}, {
|
||||
Name: "QuotedTrailingCR",
|
||||
Input: "\"field\"\r",
|
||||
Output: [][]string{{"field"}},
|
||||
}, {
|
||||
Name: "QuotedTrailingCRCR",
|
||||
Input: "\"field\"\r\r",
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Column: 6, Err: ErrQuote},
|
||||
}, {
|
||||
Name: "FieldCR",
|
||||
Input: "field\rfield\r",
|
||||
Output: [][]string{{"field\rfield"}},
|
||||
}, {
|
||||
Name: "FieldCRCR",
|
||||
Input: "field\r\rfield\r\r",
|
||||
Output: [][]string{{"field\r\rfield\r"}},
|
||||
}, {
|
||||
Name: "FieldCRCRLF",
|
||||
Input: "field\r\r\nfield\r\r\n",
|
||||
Output: [][]string{{"field\r"}, {"field\r"}},
|
||||
}, {
|
||||
Name: "FieldCRCRLFCR",
|
||||
Input: "field\r\r\n\rfield\r\r\n\r",
|
||||
Output: [][]string{{"field\r"}, {"\rfield\r"}},
|
||||
}, {
|
||||
Name: "FieldCRCRLFCRCR",
|
||||
Input: "field\r\r\n\r\rfield\r\r\n\r\r",
|
||||
Output: [][]string{{"field\r"}, {"\r\rfield\r"}, {"\r"}},
|
||||
}, {
|
||||
Name: "MultiFieldCRCRLFCRCR",
|
||||
Input: "field1,field2\r\r\n\r\rfield1,field2\r\r\n\r\r,",
|
||||
Output: [][]string{
|
||||
{"field1", "field2\r"},
|
||||
{"\r\rfield1", "field2\r"},
|
||||
{"\r\r", ""},
|
||||
},
|
||||
}, {
|
||||
Name: "NonASCIICommaAndComment",
|
||||
Input: "a£b,c£ \td,e\n€ comment\n",
|
||||
Output: [][]string{{"a", "b,c", "d,e"}},
|
||||
TrimLeadingSpace: true,
|
||||
Comma: '£',
|
||||
Comment: '€',
|
||||
}, {
|
||||
Name: "NonASCIICommaAndCommentWithQuotes",
|
||||
Input: "a€\" b,\"€ c\nλ comment\n",
|
||||
Output: [][]string{{"a", " b,", " c"}},
|
||||
Comma: '€',
|
||||
Comment: 'λ',
|
||||
}, {
|
||||
// λ and θ start with the same byte.
|
||||
// This tests that the parser doesn't confuse such characters.
|
||||
Name: "NonASCIICommaConfusion",
|
||||
Input: "\"abθcd\"λefθgh",
|
||||
Output: [][]string{{"abθcd", "efθgh"}},
|
||||
Comma: 'λ',
|
||||
Comment: '€',
|
||||
}, {
|
||||
Name: "NonASCIICommentConfusion",
|
||||
Input: "λ\nλ\nθ\nλ\n",
|
||||
Output: [][]string{{"λ"}, {"λ"}, {"λ"}},
|
||||
Comment: 'θ',
|
||||
}, {
|
||||
Name: "QuotedFieldMultipleLF",
|
||||
Input: "\"\n\n\n\n\"",
|
||||
Output: [][]string{{"\n\n\n\n"}},
|
||||
}, {
|
||||
Name: "MultipleCRLF",
|
||||
Input: "\r\n\r\n\r\n\r\n",
|
||||
}, {
|
||||
// The implementation may read each line in several chunks if it doesn't fit entirely
|
||||
// in the read buffer, so we should test the code to handle that condition.
|
||||
Name: "HugeLines",
|
||||
Input: strings.Repeat("#ignore\n", 10000) + strings.Repeat("@", 5000) + "," + strings.Repeat("*", 5000),
|
||||
Output: [][]string{{strings.Repeat("@", 5000), strings.Repeat("*", 5000)}},
|
||||
Comment: '#',
|
||||
}, {
|
||||
Name: "QuoteWithTrailingCRLF",
|
||||
Input: "\"foo\"bar\"\r\n",
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Column: 4, Err: ErrQuote},
|
||||
}, {
|
||||
Name: "LazyQuoteWithTrailingCRLF",
|
||||
Input: "\"foo\"bar\"\r\n",
|
||||
Output: [][]string{{`foo"bar`}},
|
||||
LazyQuotes: true,
|
||||
}, {
|
||||
Name: "DoubleQuoteWithTrailingCRLF",
|
||||
Input: "\"foo\"\"bar\"\r\n",
|
||||
Output: [][]string{{`foo"bar`}},
|
||||
}, {
|
||||
Name: "EvenQuotes",
|
||||
Input: `""""""""`,
|
||||
Output: [][]string{{`"""`}},
|
||||
}, {
|
||||
Name: "OddQuotes",
|
||||
Input: `"""""""`,
|
||||
Error: &ParseError{StartLine: 1, Line: 1, Column: 7, Err: ErrQuote},
|
||||
}, {
|
||||
Name: "LazyOddQuotes",
|
||||
Input: `"""""""`,
|
||||
Output: [][]string{{`"""`}},
|
||||
LazyQuotes: true,
|
||||
}, {
|
||||
Name: "BadComma1",
|
||||
Comma: '\n',
|
||||
Error: errInvalidDelim,
|
||||
}, {
|
||||
Name: "BadComma2",
|
||||
Comma: '\r',
|
||||
Error: errInvalidDelim,
|
||||
}, {
|
||||
Name: "BadComma3",
|
||||
Comma: '"',
|
||||
Error: errInvalidDelim,
|
||||
}, {
|
||||
Name: "BadComma4",
|
||||
Comma: utf8.RuneError,
|
||||
Error: errInvalidDelim,
|
||||
}, {
|
||||
Name: "BadComment1",
|
||||
Comment: '\n',
|
||||
Error: errInvalidDelim,
|
||||
}, {
|
||||
Name: "BadComment2",
|
||||
Comment: '\r',
|
||||
Error: errInvalidDelim,
|
||||
}, {
|
||||
Name: "BadComment3",
|
||||
Comment: utf8.RuneError,
|
||||
Error: errInvalidDelim,
|
||||
}, {
|
||||
Name: "BadCommaComment",
|
||||
Comma: 'X',
|
||||
Comment: 'X',
|
||||
Error: errInvalidDelim,
|
||||
}}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.Name, func(t *testing.T) {
|
||||
r := NewReader(strings.NewReader(tt.Input))
|
||||
|
||||
if tt.Comma != 0 {
|
||||
r.Comma = tt.Comma
|
||||
}
|
||||
r.Comment = tt.Comment
|
||||
if tt.UseFieldsPerRecord {
|
||||
r.FieldsPerRecord = tt.FieldsPerRecord
|
||||
} else {
|
||||
r.FieldsPerRecord = -1
|
||||
}
|
||||
r.LazyQuotes = tt.LazyQuotes
|
||||
r.TrimLeadingSpace = tt.TrimLeadingSpace
|
||||
r.ReuseRecord = tt.ReuseRecord
|
||||
|
||||
out, err := r.ReadAll()
|
||||
if !reflect.DeepEqual(err, tt.Error) {
|
||||
t.Errorf("ReadAll() error:\ngot %v\nwant %v", err, tt.Error)
|
||||
} else if !reflect.DeepEqual(out, tt.Output) {
|
||||
t.Errorf("ReadAll() output:\ngot %q\nwant %q", out, tt.Output)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// nTimes is an io.Reader that produces the contents of s repeated n times
// and then reports io.EOF.
type nTimes struct {
	s   string // text to emit on every repetition
	n   int    // repetitions still to be emitted
	off int    // read offset into s for the repetition in progress
}

// Read fills p with as many bytes of the repeated string as fit. It returns
// with a nil error once p is full, and with io.EOF (possibly alongside a
// non-zero count) once no repetitions remain or s is empty.
func (r *nTimes) Read(p []byte) (n int, err error) {
	for r.n > 0 && r.s != "" {
		copied := copy(p[n:], r.s[r.off:])
		n += copied
		r.off += copied
		if r.off == len(r.s) {
			// One full copy of s has been delivered; start the next one.
			r.off, r.n = 0, r.n-1
		}
		if n == len(p) {
			return n, nil
		}
	}
	return n, io.EOF
}
|
||||
|
||||
// benchmarkRead measures reading the provided CSV rows data.
|
||||
// initReader, if non-nil, modifies the Reader before it's used.
|
||||
func benchmarkRead(b *testing.B, initReader func(*Reader), rows string) {
|
||||
b.ReportAllocs()
|
||||
r := NewReader(&nTimes{s: rows, n: b.N})
|
||||
if initReader != nil {
|
||||
initReader(r)
|
||||
}
|
||||
for {
|
||||
_, err := r.Read()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
b.Fatal(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// benchmarkCSVData is the input shared by the small-field Read benchmarks:
// five unquoted rows followed by five quoted rows, four fields each, with a
// growing number of empty fields.
const benchmarkCSVData = `x,y,z,w
x,y,z,
x,y,,
x,,,
,,,
"x","y","z","w"
"x","y","z",""
"x","y","",""
"x","","",""
"","","",""
`
|
||||
|
||||
func BenchmarkRead(b *testing.B) {
|
||||
benchmarkRead(b, nil, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadWithFieldsPerRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = 4 }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadWithoutFieldsPerRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.FieldsPerRecord = -1 }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadLargeFields(b *testing.B) {
|
||||
benchmarkRead(b, nil, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv
|
||||
,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
`, 3))
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecordWithFieldsPerRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = 4 }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecordWithoutFieldsPerRecord(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true; r.FieldsPerRecord = -1 }, benchmarkCSVData)
|
||||
}
|
||||
|
||||
func BenchmarkReadReuseRecordLargeFields(b *testing.B) {
|
||||
benchmarkRead(b, func(r *Reader) { r.ReuseRecord = true }, strings.Repeat(`xxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
xxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvv
|
||||
,,zzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx,yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz,wwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwwww,vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
|
||||
`, 3))
|
||||
}
|
167
pkg/csvparser/writer.go
Normal file
167
pkg/csvparser/writer.go
Normal file
@ -0,0 +1,167 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in https://golang.org/LICENSE
|
||||
|
||||
package csv
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"io"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// Writer encodes records as CSV and writes them to an underlying io.Writer.
//
// A Writer obtained from NewWriter separates fields with ',' and ends each
// record with a newline. The exported fields may be changed to customize
// this, but only before the first call to Write or WriteAll.
//
// Comma is the field delimiter.
//
// When UseCRLF is true, each output line ends with \r\n instead of \n.
//
// Record writes are buffered. Once all data has been written, the client
// should call Flush so that everything is forwarded to the underlying
// io.Writer, and then call Error to check for any error that occurred.
type Writer struct {
	Comma   rune // Field delimiter (set to ',' by NewWriter)
	UseCRLF bool // True to use \r\n as the line terminator
	w       *bufio.Writer
}

// NewWriter returns a new Writer that writes to w, with Comma set to ','.
func NewWriter(w io.Writer) *Writer {
	cw := new(Writer)
	cw.Comma = ','
	cw.w = bufio.NewWriter(w)
	return cw
}
|
||||
|
||||
// Write writes a single CSV record to w along with any necessary quoting.
|
||||
// A record is a slice of strings with each string being one field.
|
||||
// Writes are buffered, so Flush must eventually be called to ensure
|
||||
// that the record is written to the underlying io.Writer.
|
||||
func (w *Writer) Write(record []string) error {
|
||||
if !validDelim(w.Comma) {
|
||||
return errInvalidDelim
|
||||
}
|
||||
|
||||
for n, field := range record {
|
||||
if n > 0 {
|
||||
if _, err := w.w.WriteRune(w.Comma); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// If we don't have to have a quoted field then just
|
||||
// write out the field and continue to the next field.
|
||||
if !w.fieldNeedsQuotes(field) {
|
||||
if _, err := w.w.WriteString(field); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
if err := w.w.WriteByte('"'); err != nil {
|
||||
return err
|
||||
}
|
||||
for len(field) > 0 {
|
||||
// Search for special characters.
|
||||
i := strings.IndexAny(field, "\"\r\n")
|
||||
if i < 0 {
|
||||
i = len(field)
|
||||
}
|
||||
|
||||
// Copy verbatim everything before the special character.
|
||||
if _, err := w.w.WriteString(field[:i]); err != nil {
|
||||
return err
|
||||
}
|
||||
field = field[i:]
|
||||
|
||||
// Encode the special character.
|
||||
if len(field) > 0 {
|
||||
var err error
|
||||
switch field[0] {
|
||||
case '"':
|
||||
_, err = w.w.WriteString(`""`)
|
||||
case '\r':
|
||||
if !w.UseCRLF {
|
||||
err = w.w.WriteByte('\r')
|
||||
}
|
||||
case '\n':
|
||||
if w.UseCRLF {
|
||||
_, err = w.w.WriteString("\r\n")
|
||||
} else {
|
||||
err = w.w.WriteByte('\n')
|
||||
}
|
||||
}
|
||||
field = field[1:]
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := w.w.WriteByte('"'); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
var err error
|
||||
if w.UseCRLF {
|
||||
_, err = w.w.WriteString("\r\n")
|
||||
} else {
|
||||
err = w.w.WriteByte('\n')
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// Flush writes any buffered data to the underlying io.Writer.
|
||||
// To check if an error occurred during the Flush, call Error.
|
||||
func (w *Writer) Flush() {
|
||||
w.w.Flush()
|
||||
}
|
||||
|
||||
// Error reports any error that has occurred during a previous Write or Flush.
|
||||
func (w *Writer) Error() error {
|
||||
_, err := w.w.Write(nil)
|
||||
return err
|
||||
}
|
||||
|
||||
// WriteAll writes multiple CSV records to w using Write and then calls Flush,
|
||||
// returning any error from the Flush.
|
||||
func (w *Writer) WriteAll(records [][]string) error {
|
||||
for _, record := range records {
|
||||
err := w.Write(record)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return w.w.Flush()
|
||||
}
|
||||
|
||||
// fieldNeedsQuotes reports whether our field must be enclosed in quotes.
|
||||
// Fields with a Comma, fields with a quote or newline, and
|
||||
// fields which start with a space must be enclosed in quotes.
|
||||
// We used to quote empty strings, but we do not anymore (as of Go 1.4).
|
||||
// The two representations should be equivalent, but Postgres distinguishes
|
||||
// quoted vs non-quoted empty string during database imports, and it has
|
||||
// an option to force the quoted behavior for non-quoted CSV but it has
|
||||
// no option to force the non-quoted behavior for quoted CSV, making
|
||||
// CSV with quoted empty strings strictly less useful.
|
||||
// Not quoting the empty string also makes this package match the behavior
|
||||
// of Microsoft Excel and Google Drive.
|
||||
// For Postgres, quote the data terminating string `\.`.
|
||||
func (w *Writer) fieldNeedsQuotes(field string) bool {
|
||||
if field == "" {
|
||||
return false
|
||||
}
|
||||
if field == `\.` || strings.ContainsRune(field, w.Comma) || strings.ContainsAny(field, "\"\r\n") {
|
||||
return true
|
||||
}
|
||||
|
||||
r1, _ := utf8.DecodeRuneInString(field)
|
||||
return unicode.IsSpace(r1)
|
||||
}
|
95
pkg/csvparser/writer_test.go
Normal file
95
pkg/csvparser/writer_test.go
Normal file
@ -0,0 +1,95 @@
|
||||
// Copyright 2011 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in https://golang.org/LICENSE
|
||||
|
||||
package csv
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
|
||||
var writeTests = []struct {
|
||||
Input [][]string
|
||||
Output string
|
||||
Error error
|
||||
UseCRLF bool
|
||||
Comma rune
|
||||
}{
|
||||
{Input: [][]string{{"abc"}}, Output: "abc\n"},
|
||||
{Input: [][]string{{"abc"}}, Output: "abc\r\n", UseCRLF: true},
|
||||
{Input: [][]string{{`"abc"`}}, Output: `"""abc"""` + "\n"},
|
||||
{Input: [][]string{{`a"b`}}, Output: `"a""b"` + "\n"},
|
||||
{Input: [][]string{{`"a"b"`}}, Output: `"""a""b"""` + "\n"},
|
||||
{Input: [][]string{{" abc"}}, Output: `" abc"` + "\n"},
|
||||
{Input: [][]string{{"abc,def"}}, Output: `"abc,def"` + "\n"},
|
||||
{Input: [][]string{{"abc", "def"}}, Output: "abc,def\n"},
|
||||
{Input: [][]string{{"abc"}, {"def"}}, Output: "abc\ndef\n"},
|
||||
{Input: [][]string{{"abc\ndef"}}, Output: "\"abc\ndef\"\n"},
|
||||
{Input: [][]string{{"abc\ndef"}}, Output: "\"abc\r\ndef\"\r\n", UseCRLF: true},
|
||||
{Input: [][]string{{"abc\rdef"}}, Output: "\"abcdef\"\r\n", UseCRLF: true},
|
||||
{Input: [][]string{{"abc\rdef"}}, Output: "\"abc\rdef\"\n", UseCRLF: false},
|
||||
{Input: [][]string{{""}}, Output: "\n"},
|
||||
{Input: [][]string{{"", ""}}, Output: ",\n"},
|
||||
{Input: [][]string{{"", "", ""}}, Output: ",,\n"},
|
||||
{Input: [][]string{{"", "", "a"}}, Output: ",,a\n"},
|
||||
{Input: [][]string{{"", "a", ""}}, Output: ",a,\n"},
|
||||
{Input: [][]string{{"", "a", "a"}}, Output: ",a,a\n"},
|
||||
{Input: [][]string{{"a", "", ""}}, Output: "a,,\n"},
|
||||
{Input: [][]string{{"a", "", "a"}}, Output: "a,,a\n"},
|
||||
{Input: [][]string{{"a", "a", ""}}, Output: "a,a,\n"},
|
||||
{Input: [][]string{{"a", "a", "a"}}, Output: "a,a,a\n"},
|
||||
{Input: [][]string{{`\.`}}, Output: "\"\\.\"\n"},
|
||||
{Input: [][]string{{"x09\x41\xb4\x1c", "aktau"}}, Output: "x09\x41\xb4\x1c,aktau\n"},
|
||||
{Input: [][]string{{",x09\x41\xb4\x1c", "aktau"}}, Output: "\",x09\x41\xb4\x1c\",aktau\n"},
|
||||
{Input: [][]string{{"a", "a", ""}}, Output: "a|a|\n", Comma: '|'},
|
||||
{Input: [][]string{{",", ",", ""}}, Output: ",|,|\n", Comma: '|'},
|
||||
{Input: [][]string{{"foo"}}, Comma: '"', Error: errInvalidDelim},
|
||||
}
|
||||
|
||||
func TestWrite(t *testing.T) {
|
||||
for n, tt := range writeTests {
|
||||
b := &bytes.Buffer{}
|
||||
f := NewWriter(b)
|
||||
f.UseCRLF = tt.UseCRLF
|
||||
if tt.Comma != 0 {
|
||||
f.Comma = tt.Comma
|
||||
}
|
||||
err := f.WriteAll(tt.Input)
|
||||
if err != tt.Error {
|
||||
t.Errorf("Unexpected error:\ngot %v\nwant %v", err, tt.Error)
|
||||
}
|
||||
out := b.String()
|
||||
if out != tt.Output {
|
||||
t.Errorf("#%d: out=%q want %q", n, out, tt.Output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// errorWriter is a writer stub whose Write always fails; it accepts no data.
type errorWriter struct{}

// Write ignores its argument and reports zero bytes written together with a
// fixed error.
func (errorWriter) Write(_ []byte) (int, error) {
	failure := errors.New("Test")
	return 0, failure
}
|
||||
|
||||
func TestError(t *testing.T) {
|
||||
b := &bytes.Buffer{}
|
||||
f := NewWriter(b)
|
||||
f.Write([]string{"abc"})
|
||||
f.Flush()
|
||||
err := f.Error()
|
||||
|
||||
if err != nil {
|
||||
t.Errorf("Unexpected error: %s\n", err)
|
||||
}
|
||||
|
||||
f = NewWriter(errorWriter{})
|
||||
f.Write([]string{"abc"})
|
||||
f.Flush()
|
||||
err = f.Error()
|
||||
|
||||
if err == nil {
|
||||
t.Error("Error should not be nil")
|
||||
}
|
||||
}
|
@ -19,12 +19,12 @@ package csv
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
"runtime"
|
||||
"sync"
|
||||
|
||||
csv "github.com/minio/minio/pkg/csvparser"
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
)
|
||||
|
||||
|
@ -17,13 +17,13 @@
|
||||
package csv
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/bcicen/jstream"
|
||||
csv "github.com/minio/minio/pkg/csvparser"
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user