// Copyright (c) 2015-2021 MinIO, Inc. // // This file is part of MinIO Object Storage stack // // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU Affero General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Affero General Public License for more details. // // You should have received a copy of the GNU Affero General Public License // along with this program. If not, see . package s3select import ( "bufio" "bytes" "compress/bzip2" "encoding/xml" "errors" "fmt" "io" "io/ioutil" "net/http" "os" "strings" "sync" "github.com/klauspost/compress/s2" "github.com/klauspost/compress/zstd" gzip "github.com/klauspost/pgzip" "github.com/minio/minio/internal/s3select/csv" "github.com/minio/minio/internal/s3select/json" "github.com/minio/minio/internal/s3select/parquet" "github.com/minio/minio/internal/s3select/simdj" "github.com/minio/minio/internal/s3select/sql" "github.com/minio/simdjson-go" "github.com/pierrec/lz4" ) type recordReader interface { // Read a record. // dst is optional but will be used if valid. Read(dst sql.Record) (sql.Record, error) Close() error } const ( csvFormat = "csv" jsonFormat = "json" parquetFormat = "parquet" ) // CompressionType - represents value inside in request XML. type CompressionType string const ( noneType CompressionType = "none" gzipType CompressionType = "GZIP" bzip2Type CompressionType = "BZIP2" zstdType CompressionType = "ZSTD" lz4Type CompressionType = "LZ4" s2Type CompressionType = "S2" snappyType CompressionType = "SNAPPY" ) const ( maxRecordSize = 1 << 20 // 1 MiB ) var bufPool = sync.Pool{ New: func() interface{} { // make a buffer with a reasonable capacity. return bytes.NewBuffer(make([]byte, 0, maxRecordSize)) }, } var bufioWriterPool = sync.Pool{ New: func() interface{} { // ioutil.Discard is just used to create the writer. Actual destination // writer is set later by Reset() before using it. return bufio.NewWriter(ioutil.Discard) }, } // UnmarshalXML - decodes XML data. func (c *CompressionType) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { var s string if err := d.DecodeElement(&s, &start); err != nil { return errMalformedXML(err) } parsedType := CompressionType(strings.ToUpper(s)) if s == "" || parsedType == "NONE" { parsedType = noneType } switch parsedType { case noneType, gzipType, bzip2Type, snappyType, s2Type, zstdType, lz4Type: default: return errInvalidCompressionFormat(fmt.Errorf("invalid compression format '%v'", s)) } *c = parsedType return nil } // InputSerialization - represents elements inside in request XML. type InputSerialization struct { CompressionType CompressionType `xml:"CompressionType"` CSVArgs csv.ReaderArgs `xml:"CSV"` JSONArgs json.ReaderArgs `xml:"JSON"` ParquetArgs parquet.ReaderArgs `xml:"Parquet"` unmarshaled bool format string } // IsEmpty - returns whether input serialization is empty or not. func (input *InputSerialization) IsEmpty() bool { return !input.unmarshaled } // UnmarshalXML - decodes XML data. func (input *InputSerialization) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { // Make subtype to avoid recursive UnmarshalXML(). type subInputSerialization InputSerialization parsedInput := subInputSerialization{} if err := d.DecodeElement(&parsedInput, &start); err != nil { return errMalformedXML(err) } // If no compression is specified, set to noneType if parsedInput.CompressionType == "" { parsedInput.CompressionType = noneType } found := 0 if !parsedInput.CSVArgs.IsEmpty() { parsedInput.format = csvFormat found++ } if !parsedInput.JSONArgs.IsEmpty() { parsedInput.format = jsonFormat found++ } if !parsedInput.ParquetArgs.IsEmpty() { if parsedInput.CompressionType != "" && parsedInput.CompressionType != noneType { return errInvalidRequestParameter(fmt.Errorf("CompressionType must be NONE for Parquet format")) } parsedInput.format = parquetFormat found++ } if found != 1 { return errInvalidDataSource(nil) } *input = InputSerialization(parsedInput) input.unmarshaled = true return nil } // OutputSerialization - represents elements inside in request XML. type OutputSerialization struct { CSVArgs csv.WriterArgs `xml:"CSV"` JSONArgs json.WriterArgs `xml:"JSON"` unmarshaled bool format string } // IsEmpty - returns whether output serialization is empty or not. func (output *OutputSerialization) IsEmpty() bool { return !output.unmarshaled } // UnmarshalXML - decodes XML data. func (output *OutputSerialization) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { // Make subtype to avoid recursive UnmarshalXML(). type subOutputSerialization OutputSerialization parsedOutput := subOutputSerialization{} if err := d.DecodeElement(&parsedOutput, &start); err != nil { return errMalformedXML(err) } found := 0 if !parsedOutput.CSVArgs.IsEmpty() { parsedOutput.format = csvFormat found++ } if !parsedOutput.JSONArgs.IsEmpty() { parsedOutput.format = jsonFormat found++ } if found != 1 { return errObjectSerializationConflict(fmt.Errorf("either CSV or JSON should be present in OutputSerialization")) } *output = OutputSerialization(parsedOutput) output.unmarshaled = true return nil } // RequestProgress - represents elements inside in request XML. type RequestProgress struct { Enabled bool `xml:"Enabled"` } // S3Select - filters the contents on a simple structured query language (SQL) statement. It // represents elements inside in request XML specified in detail at // https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html. type S3Select struct { XMLName xml.Name `xml:"SelectRequest"` Expression string `xml:"Expression"` ExpressionType string `xml:"ExpressionType"` Input InputSerialization `xml:"InputSerialization"` Output OutputSerialization `xml:"OutputSerialization"` Progress RequestProgress `xml:"RequestProgress"` statement *sql.SelectStatement progressReader *progressReader recordReader recordReader close func() error } var ( legacyXMLName = "SelectObjectContentRequest" ) // UnmarshalXML - decodes XML data. func (s3Select *S3Select) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error { // S3 also supports the older SelectObjectContentRequest tag, // though it is no longer found in documentation. This is // checked and renamed below to allow older clients to also // work. if start.Name.Local == legacyXMLName { start.Name = xml.Name{Space: "", Local: "SelectRequest"} } // Make subtype to avoid recursive UnmarshalXML(). type subS3Select S3Select parsedS3Select := subS3Select{} if err := d.DecodeElement(&parsedS3Select, &start); err != nil { if _, ok := err.(*s3Error); ok { return err } return errMalformedXML(err) } parsedS3Select.ExpressionType = strings.ToLower(parsedS3Select.ExpressionType) if parsedS3Select.ExpressionType != "sql" { return errInvalidExpressionType(fmt.Errorf("invalid expression type '%v'", parsedS3Select.ExpressionType)) } if parsedS3Select.Input.IsEmpty() { return errMissingRequiredParameter(fmt.Errorf("InputSerialization must be provided")) } if parsedS3Select.Output.IsEmpty() { return errMissingRequiredParameter(fmt.Errorf("OutputSerialization must be provided")) } statement, err := sql.ParseSelectStatement(parsedS3Select.Expression) if err != nil { return err } parsedS3Select.statement = &statement *s3Select = S3Select(parsedS3Select) return nil } func (s3Select *S3Select) outputRecord() sql.Record { switch s3Select.Output.format { case csvFormat: return csv.NewRecord() case jsonFormat: return json.NewRecord(sql.SelectFmtJSON) } panic(fmt.Errorf("unknown output format '%v'", s3Select.Output.format)) } func (s3Select *S3Select) getProgress() (bytesScanned, bytesProcessed int64) { if s3Select.progressReader != nil { return s3Select.progressReader.Stats() } return -1, -1 } // Open - opens S3 object by using callback for SQL selection query. // Currently CSV, JSON and Apache Parquet formats are supported. func (s3Select *S3Select) Open(getReader func(offset, length int64) (io.ReadCloser, error)) error { switch s3Select.Input.format { case csvFormat: rc, err := getReader(0, -1) if err != nil { return err } s3Select.progressReader, err = newProgressReader(rc, s3Select.Input.CompressionType) if err != nil { rc.Close() return err } s3Select.recordReader, err = csv.NewReader(s3Select.progressReader, &s3Select.Input.CSVArgs) if err != nil { rc.Close() var stErr bzip2.StructuralError if errors.As(err, &stErr) { return errInvalidCompression(err, s3Select.Input.CompressionType) } // Test these compressor errors errs := []error{ gzip.ErrHeader, gzip.ErrChecksum, s2.ErrCorrupt, s2.ErrUnsupported, s2.ErrCRC, zstd.ErrBlockTooSmall, zstd.ErrMagicMismatch, zstd.ErrWindowSizeExceeded, zstd.ErrUnknownDictionary, zstd.ErrWindowSizeTooSmall, lz4.ErrInvalid, lz4.ErrBlockDependency, } for _, e := range errs { if errors.Is(err, e) { return errInvalidCompression(err, s3Select.Input.CompressionType) } } return err } s3Select.close = rc.Close return nil case jsonFormat: rc, err := getReader(0, -1) if err != nil { return err } s3Select.progressReader, err = newProgressReader(rc, s3Select.Input.CompressionType) if err != nil { rc.Close() return err } if strings.EqualFold(s3Select.Input.JSONArgs.ContentType, "lines") { if simdjson.SupportedCPU() { s3Select.recordReader = simdj.NewReader(s3Select.progressReader, &s3Select.Input.JSONArgs) } else { s3Select.recordReader = json.NewPReader(s3Select.progressReader, &s3Select.Input.JSONArgs) } } else { s3Select.recordReader = json.NewReader(s3Select.progressReader, &s3Select.Input.JSONArgs) } s3Select.close = rc.Close return nil case parquetFormat: if !strings.EqualFold(os.Getenv("MINIO_API_SELECT_PARQUET"), "on") { return errors.New("parquet format parsing not enabled on server") } var err error s3Select.recordReader, err = parquet.NewReader(getReader, &s3Select.Input.ParquetArgs) return err } panic(fmt.Errorf("unknown input format '%v'", s3Select.Input.format)) } func (s3Select *S3Select) marshal(buf *bytes.Buffer, record sql.Record) error { switch s3Select.Output.format { case csvFormat: // Use bufio Writer to prevent csv.Writer from allocating a new buffer. bufioWriter := bufioWriterPool.Get().(*bufio.Writer) defer func() { bufioWriter.Reset(ioutil.Discard) bufioWriterPool.Put(bufioWriter) }() bufioWriter.Reset(buf) opts := sql.WriteCSVOpts{ FieldDelimiter: []rune(s3Select.Output.CSVArgs.FieldDelimiter)[0], Quote: []rune(s3Select.Output.CSVArgs.QuoteCharacter)[0], QuoteEscape: []rune(s3Select.Output.CSVArgs.QuoteEscapeCharacter)[0], AlwaysQuote: strings.ToLower(s3Select.Output.CSVArgs.QuoteFields) == "always", } err := record.WriteCSV(bufioWriter, opts) if err != nil { return err } err = bufioWriter.Flush() if err != nil { return err } if buf.Bytes()[buf.Len()-1] == '\n' { buf.Truncate(buf.Len() - 1) } buf.WriteString(s3Select.Output.CSVArgs.RecordDelimiter) return nil case jsonFormat: err := record.WriteJSON(buf) if err != nil { return err } // Trim trailing newline from non-simd output if buf.Bytes()[buf.Len()-1] == '\n' { buf.Truncate(buf.Len() - 1) } buf.WriteString(s3Select.Output.JSONArgs.RecordDelimiter) return nil } panic(fmt.Errorf("unknown output format '%v'", s3Select.Output.format)) } // Evaluate - filters and sends records read from opened reader as per select statement to http response writer. func (s3Select *S3Select) Evaluate(w http.ResponseWriter) { defer func() { if s3Select.close != nil { s3Select.close() } }() getProgressFunc := s3Select.getProgress if !s3Select.Progress.Enabled { getProgressFunc = nil } writer := newMessageWriter(w, getProgressFunc) var outputQueue []sql.Record // Create queue based on the type. if s3Select.statement.IsAggregated() { outputQueue = make([]sql.Record, 0, 1) } else { outputQueue = make([]sql.Record, 0, 100) } var err error sendRecord := func() bool { buf := bufPool.Get().(*bytes.Buffer) buf.Reset() for _, outputRecord := range outputQueue { if outputRecord == nil { continue } before := buf.Len() if err = s3Select.marshal(buf, outputRecord); err != nil { bufPool.Put(buf) return false } if buf.Len()-before > maxRecordSize { writer.FinishWithError("OverMaxRecordSize", "The length of a record in the input or result is greater than maxCharsPerRecord of 1 MB.") bufPool.Put(buf) return false } } if err = writer.SendRecord(buf); err != nil { // FIXME: log this error. err = nil bufPool.Put(buf) return false } outputQueue = outputQueue[:0] return true } var rec sql.Record OuterLoop: for { if s3Select.statement.LimitReached() { if !sendRecord() { break } if err = writer.Finish(s3Select.getProgress()); err != nil { // FIXME: log this error. err = nil } break } if rec, err = s3Select.recordReader.Read(rec); err != nil { if err != io.EOF { break } if s3Select.statement.IsAggregated() { outputRecord := s3Select.outputRecord() if err = s3Select.statement.AggregateResult(outputRecord); err != nil { break } outputQueue = append(outputQueue, outputRecord) } if !sendRecord() { break } if err = writer.Finish(s3Select.getProgress()); err != nil { // FIXME: log this error. err = nil } break } var inputRecords []*sql.Record if inputRecords, err = s3Select.statement.EvalFrom(s3Select.Input.format, rec); err != nil { break } for _, inputRecord := range inputRecords { if s3Select.statement.IsAggregated() { if err = s3Select.statement.AggregateRow(*inputRecord); err != nil { break OuterLoop } } else { var outputRecord sql.Record // We will attempt to reuse the records in the table. // The type of these should not change. // The queue should always have at least one entry left for this to work. outputQueue = outputQueue[:len(outputQueue)+1] if t := outputQueue[len(outputQueue)-1]; t != nil { // If the output record is already set, we reuse it. outputRecord = t outputRecord.Reset() } else { // Create new one outputRecord = s3Select.outputRecord() outputQueue[len(outputQueue)-1] = outputRecord } outputRecord, err = s3Select.statement.Eval(*inputRecord, outputRecord) if outputRecord == nil || err != nil { // This should not be written. // Remove it from the queue. outputQueue = outputQueue[:len(outputQueue)-1] if err != nil { break OuterLoop } continue } outputQueue[len(outputQueue)-1] = outputRecord if s3Select.statement.LimitReached() { if !sendRecord() { break } if err = writer.Finish(s3Select.getProgress()); err != nil { // FIXME: log this error. err = nil } return } if len(outputQueue) < cap(outputQueue) { continue } if !sendRecord() { break OuterLoop } } } } if err != nil { _ = writer.FinishWithError("InternalError", err.Error()) } } // Close - closes opened S3 object. func (s3Select *S3Select) Close() error { return s3Select.recordReader.Close() } // NewS3Select - creates new S3Select by given request XML reader. func NewS3Select(r io.Reader) (*S3Select, error) { s3Select := &S3Select{} if err := xml.NewDecoder(r).Decode(s3Select); err != nil { return nil, err } return s3Select, nil }