mirror of
https://github.com/minio/minio.git
synced 2025-01-27 14:43:18 -05:00
7e1661f4fa
This improves the performance of certain queries dramatically, such as 'count(*)' etc. Without this PR ``` ~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz 2173762 real 0m42.464s user 0m0.071s sys 0m0.010s ``` With this PR ``` ~ time mc select --query "select count(*) from S3Object" myminio/sjm-airlines/star2000.csv.gz 2173762 real 0m17.603s user 0m0.093s sys 0m0.008s ``` Almost a 250% improvement in performance. This PR avoids a lot of type conversions and instead relies on raw sequences of data and interprets them lazily. ``` benchcmp old new benchmark old ns/op new ns/op delta BenchmarkSQLAggregate_100K-4 551213 259782 -52.87% BenchmarkSQLAggregate_1M-4 6981901985 2432413729 -65.16% BenchmarkSQLAggregate_2M-4 13511978488 4536903552 -66.42% BenchmarkSQLAggregate_10M-4 68427084908 23266283336 -66.00% benchmark old allocs new allocs delta BenchmarkSQLAggregate_100K-4 2366 485 -79.50% BenchmarkSQLAggregate_1M-4 47455492 21462860 -54.77% BenchmarkSQLAggregate_2M-4 95163637 43110771 -54.70% BenchmarkSQLAggregate_10M-4 476959550 216906510 -54.52% benchmark old bytes new bytes delta BenchmarkSQLAggregate_100K-4 1233079 1086024 -11.93% BenchmarkSQLAggregate_1M-4 2607984120 557038536 -78.64% BenchmarkSQLAggregate_2M-4 5254103616 1128149168 -78.53% BenchmarkSQLAggregate_10M-4 26443524872 5722715992 -78.36% ```
316 lines
8.9 KiB
Go
316 lines
8.9 KiB
Go
/*
|
|
* Minio Cloud Storage, (C) 2018 Minio, Inc.
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
package csv
|
|
|
|
import (
|
|
"encoding/csv"
|
|
"encoding/xml"
|
|
"io"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"github.com/tidwall/sjson"
|
|
|
|
"github.com/minio/minio/pkg/ioutil"
|
|
"github.com/minio/minio/pkg/s3select/format"
|
|
)
|
|
|
|
// Options carries the settings New uses to configure the CSV input, plus the
// request metadata exposed through the accessors of the returned reader.
type Options struct {
	// HasHeader, when true, makes the first row supply the column names
	// instead of being returned as data.
	HasHeader bool

	// RecordDelimiter is the string that separates records in the input.
	RecordDelimiter string

	// FieldDelimiter is the string that separates fields within a record.
	// Only its first character is used; when empty the encoding/csv
	// default is kept.
	FieldDelimiter string

	// Comments identifies comment lines: its first character is installed
	// as the encoding/csv comment character. Empty disables comments.
	Comments string

	// Name is the name of the table that is used for querying.
	Name string

	// ReadFrom is the stream the CSV data is read from.
	ReadFrom io.Reader

	// Compressed names the compression of the incoming stream. The value
	// "NONE" marks an uncompressed stream; other values presumably select
	// a gzip or bzip reader upstream (confirm against the caller).
	Compressed string

	// Expression is the SQL expression to be evaluated.
	Expression string

	// OutputFieldDelimiter is the field delimiter for the CSV output.
	OutputFieldDelimiter string

	// StreamSize is the size of the incoming object in bytes.
	StreamSize int64

	// HeaderOpt is true when the header setting of the request is "USE";
	// ColNameErrs only accepts column references by name when it is set.
	HeaderOpt bool

	// Progress enables or disables progress messages.
	Progress bool
}
|
|
|
|
// cinput represents a record producing input from a formatted object.
|
|
type cinput struct {
|
|
options *Options
|
|
reader *csv.Reader
|
|
firstRow []string
|
|
header []string
|
|
minOutputLength int
|
|
stats struct {
|
|
BytesScanned int64
|
|
BytesReturned int64
|
|
BytesProcessed int64
|
|
}
|
|
}
|
|
|
|
// New sets up a new Input, the first row is read when this is run.
|
|
// If there is a problem with reading the first row, the error is returned.
|
|
// Otherwise, the returned reader can be reliably consumed with Read().
|
|
// until Read() return err.
|
|
func New(opts *Options) (format.Select, error) {
|
|
// DelimitedReader treats custom record delimiter like `\r\n`,`\r`,`ab` etc and replaces it with `\n`.
|
|
normalizedReader := ioutil.NewDelimitedReader(opts.ReadFrom, []rune(opts.RecordDelimiter))
|
|
reader := &cinput{
|
|
options: opts,
|
|
reader: csv.NewReader(normalizedReader),
|
|
}
|
|
reader.stats.BytesScanned = opts.StreamSize
|
|
reader.stats.BytesProcessed = 0
|
|
reader.stats.BytesReturned = 0
|
|
reader.firstRow = nil
|
|
|
|
reader.reader.FieldsPerRecord = -1
|
|
if reader.options.FieldDelimiter != "" {
|
|
reader.reader.Comma = rune(reader.options.FieldDelimiter[0])
|
|
}
|
|
|
|
if reader.options.Comments != "" {
|
|
reader.reader.Comment = rune(reader.options.Comments[0])
|
|
}
|
|
|
|
// QuoteCharacter - " (defaulted currently)
|
|
reader.reader.LazyQuotes = true
|
|
|
|
if err := reader.readHeader(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return reader, nil
|
|
}
|
|
|
|
// cleanHeader normalizes column names in place and returns the same slice:
// an empty name becomes "_<index>" and every space is replaced by an
// underscore.
func cleanHeader(columns []string) []string {
	for idx, name := range columns {
		// A CSV may mix named and unnamed columns even when a header
		// row is present; unnamed ones get a positional name.
		if name == "" {
			name = "_" + strconv.Itoa(idx)
		}
		columns[idx] = strings.Replace(name, " ", "_", -1)
	}
	return columns
}
|
|
|
|
// readHeader reads the header into the header variable if the header is present
|
|
// as the first row of the csv
|
|
func (reader *cinput) readHeader() error {
|
|
var readErr error
|
|
if reader.options.HasHeader {
|
|
reader.firstRow, readErr = reader.reader.Read()
|
|
if readErr != nil {
|
|
return format.ErrCSVParsingError
|
|
}
|
|
reader.header = cleanHeader(reader.firstRow)
|
|
reader.firstRow = nil
|
|
} else {
|
|
reader.firstRow, readErr = reader.reader.Read()
|
|
reader.header = make([]string, len(reader.firstRow))
|
|
for i := range reader.firstRow {
|
|
reader.header[i] = "_" + strconv.Itoa(i)
|
|
}
|
|
}
|
|
reader.minOutputLength = len(reader.header)
|
|
return nil
|
|
}
|
|
|
|
// Progress - return true if progress was requested.
|
|
func (reader *cinput) Progress() bool {
|
|
return reader.options.Progress
|
|
}
|
|
|
|
// UpdateBytesProcessed - populates the bytes Processed
|
|
func (reader *cinput) UpdateBytesProcessed(size int64) {
|
|
reader.stats.BytesProcessed += size
|
|
|
|
}
|
|
|
|
// Read returns byte sequence
|
|
func (reader *cinput) Read() ([]byte, error) {
|
|
dec := reader.readRecord()
|
|
if dec != nil {
|
|
var data []byte
|
|
var err error
|
|
for i, value := range dec {
|
|
data, err = sjson.SetBytes(data, reader.header[i], value)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return data, nil
|
|
}
|
|
return nil, nil
|
|
}
|
|
|
|
// OutputFieldDelimiter - returns the delimiter specified in input request
|
|
func (reader *cinput) OutputFieldDelimiter() string {
|
|
return reader.options.OutputFieldDelimiter
|
|
}
|
|
|
|
// HasHeader - returns true or false depending upon the header.
|
|
func (reader *cinput) HasHeader() bool {
|
|
return reader.options.HasHeader
|
|
}
|
|
|
|
// Expression - return the Select Expression for
|
|
func (reader *cinput) Expression() string {
|
|
return reader.options.Expression
|
|
}
|
|
|
|
// UpdateBytesReturned - updates the Bytes returned for
|
|
func (reader *cinput) UpdateBytesReturned(size int64) {
|
|
reader.stats.BytesReturned += size
|
|
}
|
|
|
|
// Header returns the header of the reader. Either the first row if a header
|
|
// set in the options, or c#, where # is the column number, starting with 0.
|
|
func (reader *cinput) Header() []string {
|
|
return reader.header
|
|
}
|
|
|
|
// readRecord reads a single record from the stream and it always returns successfully.
|
|
// If the record is empty, an empty []string is returned.
|
|
// Record expand to match the current row size, adding blank fields as needed.
|
|
// Records never return less then the number of fields in the first row.
|
|
// Returns nil on EOF
|
|
// In the event of a parse error due to an invalid record, it is logged, and
|
|
// an empty []string is returned with the number of fields in the first row,
|
|
// as if the record were empty.
|
|
//
|
|
// In general, this is a very tolerant of problems reader.
|
|
func (reader *cinput) readRecord() []string {
|
|
var row []string
|
|
var fileErr error
|
|
|
|
if reader.firstRow != nil {
|
|
row = reader.firstRow
|
|
reader.firstRow = nil
|
|
return row
|
|
}
|
|
|
|
row, fileErr = reader.reader.Read()
|
|
emptysToAppend := reader.minOutputLength - len(row)
|
|
if fileErr == io.EOF || fileErr == io.ErrClosedPipe {
|
|
return nil
|
|
} else if _, ok := fileErr.(*csv.ParseError); ok {
|
|
emptysToAppend = reader.minOutputLength
|
|
}
|
|
|
|
if emptysToAppend > 0 {
|
|
for counter := 0; counter < emptysToAppend; counter++ {
|
|
row = append(row, "")
|
|
}
|
|
}
|
|
|
|
return row
|
|
}
|
|
|
|
// CreateStatXML is the function which does the marshaling from the stat
|
|
// structs into XML so that the progress and stat message can be sent
|
|
func (reader *cinput) CreateStatXML() (string, error) {
|
|
if reader.options.Compressed == "NONE" {
|
|
reader.stats.BytesProcessed = reader.options.StreamSize
|
|
reader.stats.BytesScanned = reader.stats.BytesProcessed
|
|
}
|
|
out, err := xml.Marshal(&format.Stats{
|
|
BytesScanned: reader.stats.BytesScanned,
|
|
BytesProcessed: reader.stats.BytesProcessed,
|
|
BytesReturned: reader.stats.BytesReturned,
|
|
})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return xml.Header + string(out), nil
|
|
}
|
|
|
|
// CreateProgressXML is the function which does the marshaling from the progress
|
|
// structs into XML so that the progress and stat message can be sent
|
|
func (reader *cinput) CreateProgressXML() (string, error) {
|
|
if reader.options.HasHeader {
|
|
reader.stats.BytesProcessed += format.ProcessSize(reader.header)
|
|
}
|
|
if reader.options.Compressed == "NONE" {
|
|
reader.stats.BytesScanned = reader.stats.BytesProcessed
|
|
}
|
|
out, err := xml.Marshal(&format.Progress{
|
|
BytesScanned: reader.stats.BytesScanned,
|
|
BytesProcessed: reader.stats.BytesProcessed,
|
|
BytesReturned: reader.stats.BytesReturned,
|
|
})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
return xml.Header + string(out), nil
|
|
}
|
|
|
|
// Type - return the data format type {
|
|
func (reader *cinput) Type() format.Type {
|
|
return format.CSV
|
|
}
|
|
|
|
// ColNameErrs is a function which makes sure that the headers are requested are
|
|
// present in the file otherwise it throws an error.
|
|
func (reader *cinput) ColNameErrs(columnNames []string) error {
|
|
for i := 0; i < len(columnNames); i++ {
|
|
if columnNames[i] == "" {
|
|
continue
|
|
}
|
|
if !format.IsInt(columnNames[i]) && !reader.options.HeaderOpt {
|
|
return format.ErrInvalidColumnIndex
|
|
}
|
|
if format.IsInt(columnNames[i]) {
|
|
tempInt, _ := strconv.Atoi(columnNames[i])
|
|
if tempInt > len(reader.Header()) || tempInt == 0 {
|
|
return format.ErrInvalidColumnIndex
|
|
}
|
|
} else {
|
|
if reader.options.HeaderOpt && !format.StringInSlice(columnNames[i], reader.Header()) {
|
|
return format.ErrParseInvalidPathComponent
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
}
|