minio/internal/s3select/csv/record.go

// Copyright (c) 2015-2021 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

package csv

import (
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"strconv"
	"strings"

	"github.com/bcicen/jstream"
csv "github.com/minio/csvparser"
"github.com/minio/minio/internal/s3select/sql"
)

// Record - is a CSV record.
type Record struct {
	columnNames  []string         // column names, in field order
	csvRecord    []string         // raw field values of the current row
	nameIndexMap map[string]int64 // column name -> position in csvRecord
}

// Get - gets the value for a column name. CSV fields do not have any
// defined type (other than the default string). So this function
// always returns fields using sql.FromBytes so that the type
// specified/implied by the query can be used, or can be automatically
// converted based on the query.
func (r *Record) Get(name string) (*sql.Value, error) {
	index, found := r.nameIndexMap[name]
	if !found {
		// Check if the name is a positional reference of the form "_N".
		if strings.HasPrefix(name, "_") {
			idx, err := strconv.Atoi(strings.TrimPrefix(name, "_"))
			if err != nil {
				return nil, fmt.Errorf("column %v not found", name)
			}
			// The position count starts at 1.
			idx--
			if idx >= len(r.csvRecord) || idx < 0 {
				// The positional index falls outside the record, return null.
				return sql.FromNull(), nil
			}
			return sql.FromBytes([]byte(r.csvRecord[idx])), nil
		}
		// TODO: Return Missing?
		return nil, fmt.Errorf("column %v not found", name)
	}
	if index >= int64(len(r.csvRecord)) {
		// No value found for column 'name', hence return null value.
		return sql.FromNull(), nil
	}
	return sql.FromBytes([]byte(r.csvRecord[index])), nil
}
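
// Illustrative sketch (not part of the original file; the row contents below
// are assumed for the example): Get resolves header names through
// nameIndexMap, and names of the form "_N" as 1-based field positions.
//
//	// Suppose r holds the row "alice,30" and the reader has populated
//	// nameIndexMap with the header {"name": 0, "age": 1}.
//	v, _ := r.Get("name") // named lookup        -> bytes "alice"
//	v, _ = r.Get("_2")    // positional, 1-based -> bytes "30"
//	v, _ = r.Get("_9")    // past the last field -> SQL NULL
//	_ = v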

// Set - sets the value for a column name.
func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
	r.columnNames = append(r.columnNames, name)
	r.csvRecord = append(r.csvRecord, value.CSVString())
	return r, nil
}

// Reset data in record.
func (r *Record) Reset() {
	if len(r.columnNames) > 0 {
		r.columnNames = r.columnNames[:0]
	}
	if len(r.csvRecord) > 0 {
		r.csvRecord = r.csvRecord[:0]
	}
	for k := range r.nameIndexMap {
		delete(r.nameIndexMap, k)
	}
}

// Clone the record.
func (r *Record) Clone(dst sql.Record) sql.Record {
	other, ok := dst.(*Record)
	if !ok {
		other = &Record{}
	}
	if len(other.columnNames) > 0 {
		other.columnNames = other.columnNames[:0]
	}
	if len(other.csvRecord) > 0 {
		other.csvRecord = other.csvRecord[:0]
	}
	other.columnNames = append(other.columnNames, r.columnNames...)
	other.csvRecord = append(other.csvRecord, r.csvRecord...)
	return other
}

// WriteCSV - encodes to CSV data.
func (r *Record) WriteCSV(writer io.Writer, opts sql.WriteCSVOpts) error {
	w := csv.NewWriter(writer)
	w.Comma = opts.FieldDelimiter
	w.AlwaysQuote = opts.AlwaysQuote
	w.Quote = opts.Quote
	w.QuoteEscape = opts.QuoteEscape
	if err := w.Write(r.csvRecord); err != nil {
		return err
	}
	w.Flush()
	return w.Error()
}

// WriteJSON - encodes to JSON data.
func (r *Record) WriteJSON(writer io.Writer) error {
	var kvs jstream.KVS = make([]jstream.KV, len(r.columnNames))
	for i := 0; i < len(r.columnNames); i++ {
		kvs[i] = jstream.KV{Key: r.columnNames[i], Value: r.csvRecord[i]}
	}
	return json.NewEncoder(writer).Encode(kvs)
}
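
// Illustrative note (assumed example output, not in the original file):
// because jstream.KVS is an ordered list of key/value pairs, WriteJSON should
// preserve the column order of the record, and every value is emitted as a
// JSON string since CSV fields are untyped. For a record with columns
// ["name", "age"] and fields ["alice", "30"], the expected output would be:
//
//	{"name":"alice","age":"30"}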

// Raw - returns the underlying data with format info.
func (r *Record) Raw() (sql.SelectObjectFormat, interface{}) {
	return sql.SelectFmtCSV, r
}

// Replace - is not supported for CSV
func (r *Record) Replace(_ interface{}) error {
	return errors.New("Replace is not supported for CSV")
}

// NewRecord - creates new CSV record.
func NewRecord() *Record {
	return &Record{}
}
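
// Illustrative usage sketch (assumed, not part of the original file): building
// a record by hand and encoding it as JSON. Error handling is omitted, and
// os.Stdout stands in for any io.Writer.
//
//	rec := NewRecord()
//	r, _ := rec.Set("name", sql.FromBytes([]byte("alice")))
//	r, _ = r.Set("age", sql.FromBytes([]byte("30")))
//	_ = r.WriteJSON(os.Stdout) // expected: {"name":"alice","age":"30"}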