Speed up s3select on CSV (#7945)

This commit is contained in:
Yao Zongyou
2019-08-31 15:07:40 +08:00
committed by Harshavardhana
parent fa3546bb03
commit ec9bfd3aef
8 changed files with 128 additions and 77 deletions

View File

@@ -82,10 +82,11 @@ func (rr *recordReader) Read(p []byte) (n int, err error) {
// Reader - CSV record reader for S3Select.
type Reader struct {
args *ReaderArgs
readCloser io.ReadCloser
csvReader *csv.Reader
columnNames []string
args *ReaderArgs
readCloser io.ReadCloser
csvReader *csv.Reader
columnNames []string
nameIndexMap map[string]int64
}
// Read - reads single record.
@@ -99,23 +100,24 @@ func (r *Reader) Read() (sql.Record, error) {
return nil, err
}
columnNames := r.columnNames
if columnNames == nil {
columnNames = make([]string, len(csvRecord))
if r.columnNames == nil {
r.columnNames = make([]string, len(csvRecord))
for i := range csvRecord {
columnNames[i] = fmt.Sprintf("_%v", i+1)
r.columnNames[i] = fmt.Sprintf("_%v", i+1)
}
}
nameIndexMap := make(map[string]int64)
for i := range columnNames {
nameIndexMap[columnNames[i]] = int64(i)
if r.nameIndexMap == nil {
r.nameIndexMap = make(map[string]int64)
for i := range r.columnNames {
r.nameIndexMap[r.columnNames[i]] = int64(i)
}
}
return &Record{
columnNames: columnNames,
columnNames: r.columnNames,
csvRecord: csvRecord,
nameIndexMap: nameIndexMap,
nameIndexMap: r.nameIndexMap,
}, nil
}

View File

@@ -17,6 +17,7 @@
package csv
import (
"bytes"
"io"
"io/ioutil"
"strings"
@@ -39,6 +40,7 @@ func TestRead(t *testing.T) {
for i, c := range cases {
var err error
var record sql.Record
var result bytes.Buffer
r, _ := NewReader(ioutil.NopCloser(strings.NewReader(c.content)), &ReaderArgs{
FileHeaderInfo: none,
@@ -51,22 +53,22 @@ func TestRead(t *testing.T) {
unmarshaled: true,
})
result := ""
for {
record, err = r.Read()
if err != nil {
break
}
s, _ := record.MarshalCSV([]rune(c.fieldDelimiter)[0])
result += string(s) + c.recordDelimiter
record.WriteCSV(&result, []rune(c.fieldDelimiter)[0])
result.Truncate(result.Len() - 1)
result.WriteString(c.recordDelimiter)
}
r.Close()
if err != io.EOF {
t.Fatalf("Case %d failed with %s", i, err)
}
if result != c.content {
t.Errorf("Case %d failed: expected %v result %v", i, c.content, result)
if result.String() != c.content {
t.Errorf("Case %d failed: expected %v result %v", i, c.content, result.String())
}
}
}

View File

@@ -17,11 +17,11 @@
package csv
import (
"bytes"
"encoding/csv"
"encoding/json"
"errors"
"fmt"
"io"
"github.com/bcicen/jstream"
"github.com/minio/minio/pkg/s3select/sql"
@@ -61,30 +61,28 @@ func (r *Record) Set(name string, value *sql.Value) error {
return nil
}
// MarshalCSV - encodes to CSV data.
func (r *Record) MarshalCSV(fieldDelimiter rune) ([]byte, error) {
buf := new(bytes.Buffer)
w := csv.NewWriter(buf)
// WriteCSV - encodes the record as CSV and writes it to writer; unlike MarshalCSV, the trailing newline is not stripped.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune) error {
w := csv.NewWriter(writer)
w.Comma = fieldDelimiter
if err := w.Write(r.csvRecord); err != nil {
return nil, err
return err
}
w.Flush()
if err := w.Error(); err != nil {
return nil, err
return err
}
data := buf.Bytes()
return data[:len(data)-1], nil
return nil
}
// MarshalJSON - encodes to JSON data.
func (r *Record) MarshalJSON() ([]byte, error) {
// WriteJSON - encodes the record as JSON and writes it to writer.
func (r *Record) WriteJSON(writer io.Writer) error {
var kvs jstream.KVS = make([]jstream.KV, len(r.columnNames))
for i := 0; i < len(r.columnNames); i++ {
kvs[i] = jstream.KV{Key: r.columnNames[i], Value: r.csvRecord[i]}
}
return json.Marshal(kvs)
return json.NewEncoder(writer).Encode(kvs)
}
// Raw - returns the underlying data with format info.