diff --git a/docs/select/README.md b/docs/select/README.md index 5f1f9d8bd..3e3a8cda9 100644 --- a/docs/select/README.md +++ b/docs/select/README.md @@ -3,13 +3,26 @@ Traditional retrieval of objects is always as whole entities, i.e GetObject for You can use the Select API to query objects with following features: -- CSV, JSON and Parquet - Objects must be in CSV, JSON, or Parquet format. +- Objects must be in CSV, JSON, or Parquet(*) format. - UTF-8 is the only encoding type the Select API supports. - GZIP or BZIP2 - CSV and JSON files can be compressed using GZIP or BZIP2. The Select API supports columnar compression for Parquet using GZIP, Snappy, LZ4. Whole object compression is not supported for Parquet objects. - Server-side encryption - The Select API supports querying objects that are protected with server-side encryption. Type inference and automatic conversion of values is performed based on the context when the value is un-typed (such as when reading CSV data). If present, the CAST function overrides automatic conversion. +The [mc sql](https://docs.min.io/docs/minio-client-complete-guide.html#sql) command can be used for executing queries using the command line. + +(*) Parquet is disabled on the MinIO server by default. See below for how to enable it. + +## Enabling Parquet Format + +Parquet is DISABLED by default since maliciously crafted input can easily crash the server. + +If you are in a controlled environment where it is safe to assume no hostile content can be uploaded to your cluster, you can safely enable Parquet. +To enable Parquet, set the environment variable `MINIO_API_SELECT_PARQUET=on`. + +# Example using Python API + ## 1. Prerequisites - Install MinIO Server from [here](http://docs.min.io/docs/minio-quickstart-guide). - Familiarity with AWS S3 API. 
diff --git a/pkg/s3select/parquet/reader.go b/pkg/s3select/parquet/reader.go index fd7ae1e6b..a3140ed8a 100644 --- a/pkg/s3select/parquet/reader.go +++ b/pkg/s3select/parquet/reader.go @@ -17,6 +17,7 @@ package parquet import ( + "fmt" "io" "github.com/bcicen/jstream" @@ -34,6 +35,12 @@ type Reader struct { // Read - reads single record. func (r *Reader) Read(dst sql.Record) (rec sql.Record, rerr error) { + defer func() { + if rec := recover(); rec != nil { + rerr = fmt.Errorf("panic reading parquet record: %v", rec) + } + }() + parquetRecord, err := r.reader.Read() if err != nil { if err != io.EOF { @@ -92,7 +99,12 @@ func (r *Reader) Close() error { } // NewReader - creates new Parquet reader using readerFunc callback. -func NewReader(getReaderFunc func(offset, length int64) (io.ReadCloser, error), args *ReaderArgs) (*Reader, error) { +func NewReader(getReaderFunc func(offset, length int64) (io.ReadCloser, error), args *ReaderArgs) (r *Reader, err error) { + defer func() { + if rec := recover(); rec != nil { + err = fmt.Errorf("panic reading parquet header: %v", rec) + } + }() reader, err := parquetgo.NewReader(getReaderFunc, nil) if err != nil { if err != io.EOF { diff --git a/pkg/s3select/select.go b/pkg/s3select/select.go index f76e10de6..cbf48cbf9 100644 --- a/pkg/s3select/select.go +++ b/pkg/s3select/select.go @@ -26,6 +26,7 @@ import ( "io" "io/ioutil" "net/http" + "os" "strings" "sync" @@ -334,6 +335,9 @@ func (s3Select *S3Select) Open(getReader func(offset, length int64) (io.ReadClos } return nil case parquetFormat: + if !strings.EqualFold(os.Getenv("MINIO_API_SELECT_PARQUET"), "on") { + return errors.New("parquet format parsing not enabled on server") + } var err error s3Select.recordReader, err = parquet.NewReader(getReader, &s3Select.Input.ParquetArgs) return err diff --git a/pkg/s3select/select_test.go b/pkg/s3select/select_test.go index dea51e05b..2c5ae6e88 100644 --- a/pkg/s3select/select_test.go +++ b/pkg/s3select/select_test.go @@ -925,6 +925,8 @@ 
func TestJSONInput(t *testing.T) { } func TestParquetInput(t *testing.T) { + os.Setenv("MINIO_API_SELECT_PARQUET", "on") + defer os.Setenv("MINIO_API_SELECT_PARQUET", "off") var testTable = []struct { requestXML []byte