SQL select query for CSV/JSON (#6648)
select * and select with explicit column names have been implemented for CSV; select * is implemented for JSON.
committed by kannappanr
parent acf46cc3b5
commit c0b4bf0a3e
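As a rough illustration of the queries this change targets (not part of the commit itself), the Go snippet below reuses the cleanExpr helper introduced in pkg/s3select to show how quoted identifiers in a select expression are rewritten to backticks before parsing. The example query strings and the main wrapper are hypothetical; "S3Object" is the default table name the commit assigns to every object.

package main

import (
	"fmt"
	"strings"
)

// cleanExpr mirrors the helper added by this commit: it rewrites double and
// single quotes to backticks before the expression reaches the select parser.
func cleanExpr(expr string) string {
	r := strings.NewReplacer("\"", "`", "'", "`")
	return r.Replace(expr)
}

func main() {
	// Illustrative queries only: select * works for CSV and JSON objects,
	// while selecting explicit column names is implemented for CSV.
	queries := []string{
		`SELECT * FROM S3Object`,
		`SELECT name, age FROM S3Object`,
		`SELECT "first name" FROM S3Object`, // quoted column containing a space
	}
	for _, q := range queries {
		fmt.Println(cleanExpr(q))
	}
}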
@@ -18,18 +18,14 @@ package s3select
import (
	"bytes"
	"compress/bzip2"
	"encoding/csv"
	"encoding/xml"
	"io"
	"strconv"
	"net/http"
	"strings"
	"time"

	"net/http"

	gzip "github.com/klauspost/pgzip"
	"github.com/minio/minio/pkg/ioutil"
	"github.com/minio/minio/pkg/s3select/format"
	"github.com/minio/minio/pkg/s3select/format/csv"
	"github.com/minio/minio/pkg/s3select/format/json"
)

const (
@@ -40,245 +36,16 @@ const (
	continuationTime time.Duration = 5 * time.Second
)

// progress represents the XML format of the progress messages.
type progress struct {
	XMLName        xml.Name `xml:"Progress" json:"-"`
	BytesScanned   int64    `xml:"BytesScanned"`
	BytesProcessed int64    `xml:"BytesProcessed"`
	BytesReturned  int64    `xml:"BytesReturned"`
}

// stats represents the XML format of the stats messages.
type stats struct {
	XMLName        xml.Name `xml:"Stats" json:"-"`
	BytesScanned   int64    `xml:"BytesScanned"`
	BytesProcessed int64    `xml:"BytesProcessed"`
	BytesReturned  int64    `xml:"BytesReturned"`
}

// statInfo tracks the bytes scanned, processed, and returned for a query.
type statInfo struct {
	BytesScanned   int64
	BytesReturned  int64
	BytesProcessed int64
}

// Input represents a record producing input from a formatted file or pipe.
type Input struct {
	options         *Options
	reader          *csv.Reader
	firstRow        []string
	header          []string
	minOutputLength int
	stats           *statInfo
}

// Options are passed to the underlying encoding/csv reader.
type Options struct {
	// HasHeader when true, will treat the first row as a header row.
	HasHeader bool

	// RecordDelimiter is the string that records are delimited by.
	RecordDelimiter string

	// FieldDelimiter is the string that fields are delimited by.
	FieldDelimiter string

	// Comments is the string whose first character marks a line of
	// text as a comment.
	Comments string

	// Name of the table that is used for querying.
	Name string

	// Compressed, when set to GZIP or BZIP2, adds a gzip or bzip2 reader
	// to extract the csv.
	Compressed string

	// ReadFrom is where the data will be read from.
	ReadFrom io.Reader

	// SQL expression meant to be evaluated.
	Expression string

	// OutputFieldDelimiter is what the output CSV fields will be delimited by.
	OutputFieldDelimiter string

	// Size of the incoming object.
	StreamSize int64

	// HeaderOpt is true when the header option is "USE".
	HeaderOpt bool

	// Progress enables or disables progress messages.
	Progress bool
}
// NewInput sets up a new Input; the first row is read when this is run.
// If there is a problem with reading the first row, the error is returned.
// Otherwise, the returned reader can be reliably consumed with ReadRecord()
// until ReadRecord() returns nil.
func NewInput(opts *Options) (*Input, error) {
	myReader := opts.ReadFrom
	var tempBytesScanned int64
	tempBytesScanned = 0
	switch opts.Compressed {
	case "GZIP":
		tempBytesScanned = opts.StreamSize
		var err error
		if myReader, err = gzip.NewReader(opts.ReadFrom); err != nil {
			return nil, ErrTruncatedInput
		}
	case "BZIP2":
		tempBytesScanned = opts.StreamSize
		myReader = bzip2.NewReader(opts.ReadFrom)
	}

	// DelimitedReader treats custom record delimiters like `\r\n`, `\r`, `ab` etc. and replaces them with `\n`.
	normalizedReader := ioutil.NewDelimitedReader(myReader, []rune(opts.RecordDelimiter))
	progress := &statInfo{
		BytesScanned:   tempBytesScanned,
		BytesProcessed: 0,
		BytesReturned:  0,
	}

	reader := &Input{
		options: opts,
		reader:  csv.NewReader(normalizedReader),
		stats:   progress,
	}
	reader.firstRow = nil

	reader.reader.FieldsPerRecord = -1
	if reader.options.FieldDelimiter != "" {
		reader.reader.Comma = rune(reader.options.FieldDelimiter[0])
	}

	if reader.options.Comments != "" {
		reader.reader.Comment = rune(reader.options.Comments[0])
	}

	// QuoteCharacter - " (defaulted currently)
	reader.reader.LazyQuotes = true

	if err := reader.readHeader(); err != nil {
		return nil, err
	}

	return reader, nil
}
// ReadRecord reads a single record from the underlying reader. It always returns
// successfully. If the record is empty, an empty []string is returned.
// Records expand to match the current row size, adding blank fields as needed.
// Records never contain fewer than the number of fields in the first row.
// Returns nil on EOF.
// In the event of a parse error due to an invalid record, it is logged, and
// an empty []string is returned with the number of fields in the first row,
// as if the record were empty.
//
// In general, this is a reader that is very tolerant of problems.
func (reader *Input) ReadRecord() []string {
	var row []string
	var fileErr error

	if reader.firstRow != nil {
		row = reader.firstRow
		reader.firstRow = nil
		return row
	}

	row, fileErr = reader.reader.Read()
	emptysToAppend := reader.minOutputLength - len(row)
	if fileErr == io.EOF || fileErr == io.ErrClosedPipe {
		return nil
	} else if _, ok := fileErr.(*csv.ParseError); ok {
		emptysToAppend = reader.minOutputLength
	}

	if emptysToAppend > 0 {
		for counter := 0; counter < emptysToAppend; counter++ {
			row = append(row, "")
		}
	}

	return row
}
// readHeader reads the header into the header variable if the header is present
// as the first row of the csv.
func (reader *Input) readHeader() error {
	var readErr error
	if reader.options.HasHeader {
		reader.firstRow, readErr = reader.reader.Read()
		if readErr != nil {
			return ErrCSVParsingError
		}
		reader.header = cleanHeader(reader.firstRow)
		reader.firstRow = nil
		reader.minOutputLength = len(reader.header)
	} else {
		reader.firstRow, readErr = reader.reader.Read()
		reader.header = make([]string, len(reader.firstRow))
		for i := 0; i < reader.minOutputLength; i++ {
			reader.header[i] = strconv.Itoa(i)
		}
	}
	return nil
}
// cleanHeader replaces the spaces in column names with underscores.
func cleanHeader(columns []string) []string {
	for i := 0; i < len(columns); i++ {
		columns[i] = strings.Replace(columns[i], " ", "_", -1)
	}
	return columns
}
// createStatXML marshals the stats struct into XML so that the final
// stats message can be sent.
func (reader *Input) createStatXML() (string, error) {
	if reader.options.Compressed == "NONE" {
		reader.stats.BytesProcessed = reader.options.StreamSize
		reader.stats.BytesScanned = reader.stats.BytesProcessed
	}
	out, err := xml.Marshal(&stats{
		BytesScanned:   reader.stats.BytesScanned,
		BytesProcessed: reader.stats.BytesProcessed,
		BytesReturned:  reader.stats.BytesReturned,
	})
	if err != nil {
		return "", err
	}
	return xml.Header + string(out), nil
}
// createProgressXML marshals the progress struct into XML so that
// progress messages can be sent.
func (reader *Input) createProgressXML() (string, error) {
	if reader.options.HasHeader {
		reader.stats.BytesProcessed += processSize(reader.header)
	}
	if reader.options.Compressed == "NONE" {
		reader.stats.BytesScanned = reader.stats.BytesProcessed
	}
	out, err := xml.Marshal(&progress{
		BytesScanned:   reader.stats.BytesScanned,
		BytesProcessed: reader.stats.BytesProcessed,
		BytesReturned:  reader.stats.BytesReturned,
	})
	if err != nil {
		return "", err
	}
	return xml.Header + string(out), nil
}
// Header returns the header of the reader. Either the first row if a header
// was set in the options, or c#, where # is the column number, starting with 0.
func (reader *Input) Header() []string {
	return reader.header
}

// ParseSelectTokens tokenizes the select query into the required columns, alias,
// limit value, where clause, aggregate function names, select functions, and error.
type ParseSelectTokens struct {
	reqCols          []string
	alias            string
	myLimit          int64
	whereClause      interface{}
	aggFunctionNames []string
	myFuncs          *SelectFuncs
	myErr            error
}

// Row is a struct for keeping track of key aspects of a row.
@@ -287,10 +54,58 @@ type Row struct {
	err error
}

// cleanExpr replaces double and single quotes with backticks for the select parser.
func cleanExpr(expr string) string {
	r := strings.NewReplacer("\"", "`", "'", "`")
	return r.Replace(expr)
}
// New initializes a new select format.
func New(gr io.Reader, size int64, req ObjectSelectRequest) (s3s format.Select, err error) {
	// Initializing options for CSV
	if req.InputSerialization.CSV != nil {
		if req.OutputSerialization.CSV.FieldDelimiter == "" {
			req.OutputSerialization.CSV.FieldDelimiter = ","
		}
		if req.InputSerialization.CSV.FileHeaderInfo == "" {
			req.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoNone
		}
		if req.InputSerialization.CSV.RecordDelimiter == "" {
			req.InputSerialization.CSV.RecordDelimiter = "\n"
		}
		s3s, err = csv.New(&csv.Options{
			HasHeader:            req.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoNone,
			RecordDelimiter:      req.InputSerialization.CSV.RecordDelimiter,
			FieldDelimiter:       req.InputSerialization.CSV.FieldDelimiter,
			Comments:             req.InputSerialization.CSV.Comments,
			Name:                 "S3Object", // Default table name for all objects
			ReadFrom:             gr,
			Compressed:           string(req.InputSerialization.CompressionType),
			Expression:           cleanExpr(req.Expression),
			OutputFieldDelimiter: req.OutputSerialization.CSV.FieldDelimiter,
			StreamSize:           size,
			HeaderOpt:            req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
			Progress:             req.RequestProgress.Enabled,
		})
	} else if req.InputSerialization.JSON != nil {
		// Initializing options for JSON
		s3s, err = json.New(&json.Options{
			Name:       "S3Object", // Default table name for all objects
			ReadFrom:   gr,
			Compressed: string(req.InputSerialization.CompressionType),
			Expression: cleanExpr(req.Expression),
			StreamSize: size,
			Type:       req.InputSerialization.JSON.Type == JSONTypeDocument,
			Progress:   req.RequestProgress.Enabled,
		})
	}
	return s3s, err
}
// Execute is the function where all the blocking occurs. It writes to the HTTP
// response writer in a streaming fashion so that the client can actively use
// the results before the query has finished executing.
func (reader *Input) Execute(writer io.Writer) error {
func Execute(writer io.Writer, f format.Select) error {
	myRow := make(chan *Row)
	curBuf := bytes.NewBuffer(make([]byte, 1000000))
	curBuf.Reset()
@@ -298,12 +113,14 @@ func (reader *Input) Execute(writer io.Writer) error {
	continuationTimer := time.NewTimer(continuationTime)
	defer progressTicker.Stop()
	defer continuationTimer.Stop()
	go reader.runSelectParser(reader.options.Expression, myRow)

	go runSelectParser(f, myRow)

	for {
		select {
		case row, ok := <-myRow:
			if ok && row.err != nil {
				errorMessage := reader.writeErrorMessage(row.err, curBuf)
				errorMessage := writeErrorMessage(row.err, curBuf)
				_, err := errorMessage.WriteTo(writer)
				flusher, okFlush := writer.(http.Flusher)
				if okFlush {
@@ -316,7 +133,7 @@ func (reader *Input) Execute(writer io.Writer) error {
				close(myRow)
				return nil
			} else if ok {
				message := reader.writeRecordMessage(row.record, curBuf)
				message := writeRecordMessage(row.record, curBuf)
				_, err := message.WriteTo(writer)
				flusher, okFlush := writer.(http.Flusher)
				if okFlush {
@@ -326,17 +143,17 @@ func (reader *Input) Execute(writer io.Writer) error {
					return err
				}
				curBuf.Reset()
				reader.stats.BytesReturned += int64(len(row.record))
				f.UpdateBytesReturned(int64(len(row.record)))
				if !continuationTimer.Stop() {
					<-continuationTimer.C
				}
				continuationTimer.Reset(continuationTime)
			} else if !ok {
				statPayload, err := reader.createStatXML()
				statPayload, err := f.CreateStatXML()
				if err != nil {
					return err
				}
				statMessage := reader.writeStatMessage(statPayload, curBuf)
				statMessage := writeStatMessage(statPayload, curBuf)
				_, err = statMessage.WriteTo(writer)
				flusher, ok := writer.(http.Flusher)
				if ok {
@@ -346,7 +163,7 @@ func (reader *Input) Execute(writer io.Writer) error {
					return err
				}
				curBuf.Reset()
				message := reader.writeEndMessage(curBuf)
				message := writeEndMessage(curBuf)
				_, err = message.WriteTo(writer)
				flusher, ok = writer.(http.Flusher)
				if ok {
@@ -360,12 +177,12 @@ func (reader *Input) Execute(writer io.Writer) error {

		case <-progressTicker.C:
			// Send progress messages only if requested by client.
			if reader.options.Progress {
				progressPayload, err := reader.createProgressXML()
			if f.Progress() {
				progressPayload, err := f.CreateProgressXML()
				if err != nil {
					return err
				}
				progressMessage := reader.writeProgressMessage(progressPayload, curBuf)
				progressMessage := writeProgressMessage(progressPayload, curBuf)
				_, err = progressMessage.WriteTo(writer)
				flusher, ok := writer.(http.Flusher)
				if ok {
@@ -377,7 +194,7 @@ func (reader *Input) Execute(writer io.Writer) error {
				curBuf.Reset()
			}
		case <-continuationTimer.C:
			message := reader.writeContinuationMessage(curBuf)
			message := writeContinuationMessage(curBuf)
			_, err := message.WriteTo(writer)
			flusher, ok := writer.(http.Flusher)
			if ok {
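The new Execute(writer, f format.Select) path above replaces the old *Input methods with calls through a format.Select value. As a minimal, hypothetical sketch (not the actual interface definition, which lives in pkg/s3select/format), the stand-in below lists only the methods Execute is seen calling in this diff:

package main

// selectFormat is a hypothetical stand-in capturing just the calls visible in
// the new Execute loop; the real format.Select interface may declare more.
type selectFormat interface {
	Progress() bool                     // whether the client requested progress messages
	UpdateBytesReturned(int64)          // accumulate bytes written back to the client
	CreateStatXML() (string, error)     // final Stats payload
	CreateProgressXML() (string, error) // periodic Progress payload
}

func main() {}

Grouping these calls behind one value is what lets the same streaming Execute loop serve both the CSV and JSON formats introduced by this commit.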