Support JSON to CSV and CSV to JSON output format conversion (#6910)

This PR implements one of the pending items in issue #6286:
in the S3 API a user can request CSV output for a JSON document
and JSON output for a CSV document. The PR also refactors the
code slightly to support this feature.
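For context, here is a minimal standalone sketch (plain Go, not MinIO's code) of the conversion this enables: a `SELECT *` over a JSON-lines object with CSV output serialization. The column order is hand-picked here; the real implementation derives it from each record's keys.

```go
package main

import (
	"encoding/csv"
	"encoding/json"
	"fmt"
	"os"
	"strings"
)

func main() {
	// A JSON-lines document, as it would be stored in the bucket.
	input := `{"name":"alice","age":"30"}
{"name":"bob","age":"25"}`

	w := csv.NewWriter(os.Stdout)
	for _, line := range strings.Split(input, "\n") {
		var rec map[string]string
		if err := json.Unmarshal([]byte(line), &rec); err != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
		// Hand-picked column order for this sketch.
		if err := w.Write([]string{rec["name"], rec["age"]}); err != nil {
			fmt.Fprintln(os.Stderr, err)
			return
		}
	}
	w.Flush()
	// Output:
	// alice,30
	// bob,25
}
```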
Harshavardhana 2018-12-07 14:55:32 -08:00 committed by kannappanr
parent 313ba74b09
commit 4c7c571875
6 changed files with 150 additions and 56 deletions

View File

@@ -218,11 +218,13 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
 		writeErrorResponse(w, ErrInvalidFileHeaderInfo, r.URL, guessIsBrowserReq(r))
 		return
 	}
-	if selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAlways &&
-		selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAsNeeded &&
-		selectReq.OutputSerialization.CSV.QuoteFields != "" {
-		writeErrorResponse(w, ErrInvalidQuoteFields, r.URL, guessIsBrowserReq(r))
-		return
+	if selectReq.OutputSerialization.CSV != nil {
+		if selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAlways &&
+			selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAsNeeded &&
+			selectReq.OutputSerialization.CSV.QuoteFields != "" {
+			writeErrorResponse(w, ErrInvalidQuoteFields, r.URL, guessIsBrowserReq(r))
+			return
+		}
 	}
 	if len(selectReq.InputSerialization.CSV.RecordDelimiter) > 2 {
 		writeErrorResponse(w, ErrInvalidRequestParameter, r.URL, guessIsBrowserReq(r))
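The nil check added above matters because a request that asks for JSON output leaves `OutputSerialization.CSV` unset. A toy sketch with hypothetical stand-in types (the values `ALWAYS`/`ASNEEDED` mirror the S3 quote-field options):

```go
package main

import "fmt"

// Hypothetical stand-ins for the request types; illustration only.
type csvOutput struct{ QuoteFields string }
type outputSerialization struct{ CSV *csvOutput }

func validQuoteFields(out outputSerialization) bool {
	// Without this guard, a JSON-output request (out.CSV == nil)
	// would panic on the field access below.
	if out.CSV == nil {
		return true
	}
	q := out.CSV.QuoteFields
	return q == "" || q == "ALWAYS" || q == "ASNEEDED"
}

func main() {
	fmt.Println(validQuoteFields(outputSerialization{}))                             // true: nothing to validate
	fmt.Println(validQuoteFields(outputSerialization{CSV: &csvOutput{"SOMETIMES"}})) // false: invalid value
}
```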

View File

@@ -57,9 +57,12 @@ type Options struct {
 	// SQL expression meant to be evaluated.
 	Expression string
 
-	// What the outputted CSV will be delimited by .
+	// Output CSV will be delimited by.
 	OutputFieldDelimiter string
 
+	// Output CSV record will be delimited by.
+	OutputRecordDelimiter string
+
 	// Size of incoming object
 	StreamSize int64
@@ -68,6 +71,9 @@ type Options struct {
 	// Progress enabled, enable/disable progress messages.
 	Progress bool
+
+	// Output format type, supported values are CSV and JSON
+	OutputType format.Type
 }
 
 // cinput represents a record producing input from a formatted object.
@@ -147,6 +153,9 @@ func (reader *cinput) readHeader() error {
 		reader.firstRow = nil
 	} else {
 		reader.firstRow, readErr = reader.reader.Read()
+		if readErr != nil {
+			return format.ErrCSVParsingError
+		}
 		reader.header = make([]string, len(reader.firstRow))
 		for i := range reader.firstRow {
 			reader.header[i] = "_" + strconv.Itoa(i)
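The synthetic header naming used above, shown standalone: when a CSV object carries no header row, columns get positional names so a SQL expression can still address them.

```go
package main

import (
	"fmt"
	"strconv"
)

func main() {
	// First data row of a headerless CSV object.
	firstRow := []string{"alice", "30", "berlin"}

	// Same scheme as readHeader above: name columns _0, _1, ...
	header := make([]string, len(firstRow))
	for i := range firstRow {
		header[i] = "_" + strconv.Itoa(i)
	}
	fmt.Println(header) // [_0 _1 _2]
}
```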
@@ -173,8 +182,13 @@ func (reader *cinput) Read() ([]byte, error) {
 	if dec != nil {
 		var data []byte
 		var err error
-		for i, value := range dec {
-			data, err = sjson.SetBytes(data, reader.header[i], value)
+		// Navigate column values in reverse order to preserve
+		// the input order for AWS S3 compatibility, because
+		// sjson adds json key/value pairs in first in last out
+		// fashion. This should be fixed in sjson ideally. Following
+		// work around is needed to circumvent this issue for now.
+		for i := len(dec) - 1; i >= 0; i-- {
+			data, err = sjson.SetBytes(data, reader.header[i], dec[i])
 			if err != nil {
 				return nil, err
 			}
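The reverse-iteration workaround above, as a runnable sketch. It assumes, per the commit comment, that sjson inserts new keys first-in last-out, so walking the columns backwards yields JSON keys in the original CSV column order.

```go
package main

import (
	"fmt"

	"github.com/tidwall/sjson"
)

func main() {
	header := []string{"name", "age", "city"}
	row := []string{"alice", "30", "berlin"}

	var data []byte
	var err error
	// Set keys in reverse so the emitted JSON keeps the CSV
	// column order (sjson prepends keys, per the comment above).
	for i := len(row) - 1; i >= 0; i-- {
		if data, err = sjson.SetBytes(data, header[i], row[i]); err != nil {
			fmt.Println(err)
			return
		}
	}
	fmt.Println(string(data)) // {"name":"alice","age":"30","city":"berlin"}
}
```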
@@ -184,11 +198,16 @@ func (reader *cinput) Read() ([]byte, error) {
 	return nil, nil
 }
 
-// OutputFieldDelimiter - returns the delimiter specified in input request
+// OutputFieldDelimiter - returns the requested output field delimiter.
 func (reader *cinput) OutputFieldDelimiter() string {
 	return reader.options.OutputFieldDelimiter
 }
 
+// OutputRecordDelimiter - returns the requested output record delimiter.
+func (reader *cinput) OutputRecordDelimiter() string {
+	return reader.options.OutputRecordDelimiter
+}
+
 // HasHeader - returns true or false depending upon the header.
 func (reader *cinput) HasHeader() bool {
 	return reader.options.HasHeader
@@ -285,11 +304,16 @@ func (reader *cinput) CreateProgressXML() (string, error) {
 	return xml.Header + string(out), nil
 }
 
-// Type - return the data format type {
+// Type - return the data format type
 func (reader *cinput) Type() format.Type {
 	return format.CSV
 }
 
+// OutputType - return the data format type
+func (reader *cinput) OutputType() format.Type {
+	return reader.options.OutputType
+}
+
 // ColNameErrs is a function which makes sure that the headers requested are
 // present in the file otherwise it throws an error.
 func (reader *cinput) ColNameErrs(columnNames []string) error {

View File

@@ -22,6 +22,7 @@ import (
 	"io"
 
 	"github.com/minio/minio/pkg/s3select/format"
+	"github.com/tidwall/gjson"
 )
// Options options are passed to the underlying encoding/json reader. // Options options are passed to the underlying encoding/json reader.
@@ -40,24 +41,32 @@ type Options struct {
 	// SQL expression meant to be evaluated.
 	Expression string
 
-	// What the outputted will be delimited by .
+	// Input record delimiter.
 	RecordDelimiter string
 
+	// Output CSV will be delimited by.
+	OutputFieldDelimiter string
+
+	// Output record delimiter.
+	OutputRecordDelimiter string
+
 	// Size of incoming object
 	StreamSize int64
 
-	// True if Type is DOCUMENTS
-	Type bool
+	// True if DocumentType is DOCUMENTS
+	DocumentType bool
 
 	// Progress enabled, enable/disable progress messages.
 	Progress bool
+
+	// Output format type, supported values are CSV and JSON
+	OutputType format.Type
 }
 
 // jinput represents a record producing input from a formatted file or pipe.
 type jinput struct {
 	options *Options
 	reader  *bufio.Reader
+	firstRow []string
 	header   []string
 	minOutputLength int
 	stats struct {
@@ -79,7 +88,6 @@ func New(opts *Options) (format.Select, error) {
 	reader.stats.BytesScanned = opts.StreamSize
 	reader.stats.BytesProcessed = 0
 	reader.stats.BytesReturned = 0
-
 	return reader, nil
 }
@@ -95,7 +103,7 @@ func (reader *jinput) UpdateBytesProcessed(size int64) {
 
 // Read the file and returns
 func (reader *jinput) Read() ([]byte, error) {
-	data, err := reader.reader.ReadBytes('\n')
+	data, _, err := reader.reader.ReadLine()
 	if err != nil {
 		if err == io.EOF || err == io.ErrClosedPipe {
 			err = nil
@@ -103,17 +111,32 @@ func (reader *jinput) Read() ([]byte, error) {
 			err = format.ErrJSONParsingError
 		}
 	}
+	if err == nil {
+		var header []string
+		gjson.ParseBytes(data).ForEach(func(key, value gjson.Result) bool {
+			header = append(header, key.String())
+			return true
+		})
+		reader.header = header
+	}
 	return data, err
 }
 
-// OutputFieldDelimiter - returns the delimiter specified in input request
+// OutputFieldDelimiter - returns the delimiter specified in the input request;
+// for JSON output this value is empty, but it does have a value when the
+// output type is CSV.
 func (reader *jinput) OutputFieldDelimiter() string {
-	return ","
+	return reader.options.OutputFieldDelimiter
 }
 
+// OutputRecordDelimiter - returns the delimiter specified in the input request,
+// emitted after each JSON record.
+func (reader *jinput) OutputRecordDelimiter() string {
+	return reader.options.OutputRecordDelimiter
+}
+
 // HasHeader - returns true or false depending upon the header.
 func (reader *jinput) HasHeader() bool {
-	return false
+	return true
 }
 
 // Expression - return the Select Expression for
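The key walk added to Read() above, isolated: gjson's ForEach visits the record's top-level keys in document order, and those keys become the header used for CSV projection.

```go
package main

import (
	"fmt"

	"github.com/tidwall/gjson"
)

func main() {
	record := []byte(`{"name":"alice","age":30,"city":"berlin"}`)

	// Collect top-level keys, as the new Read() does per record;
	// JSON-lines records may differ in keys from line to line.
	var header []string
	gjson.ParseBytes(record).ForEach(func(key, value gjson.Result) bool {
		header = append(header, key.String())
		return true // continue iteration
	})
	fmt.Println(header) // [name age city]
}
```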
@@ -128,7 +151,7 @@ func (reader *jinput) UpdateBytesReturned(size int64) {
 
 // Header returns a nil in case of
 func (reader *jinput) Header() []string {
-	return nil
+	return reader.header
 }
 
 // CreateStatXML is the function which does the marshaling from the stat
@@ -171,6 +194,11 @@ func (reader *jinput) Type() format.Type {
 	return format.JSON
 }
 
+// OutputType - return the data format type
+func (reader *jinput) OutputType() format.Type {
+	return reader.options.OutputType
+}
+
 // ColNameErrs - this is a dummy function for JSON input type.
 func (reader *jinput) ColNameErrs(columnNames []string) error {
 	return nil

View File

@@ -22,10 +22,12 @@ import "encoding/xml"
 // https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
 type Select interface {
 	Type() Type
+	OutputType() Type
 	Read() ([]byte, error)
 	Header() []string
 	HasHeader() bool
 	OutputFieldDelimiter() string
+	OutputRecordDelimiter() string
 	UpdateBytesProcessed(int64)
 	Expression() string
 	UpdateBytesReturned(int64)

View File

@@ -65,40 +65,60 @@ func New(reader io.Reader, size int64, req ObjectSelectRequest) (s3s format.Sele
 	// Initializing options for CSV
 	if req.InputSerialization.CSV != nil {
-		if req.OutputSerialization.CSV.FieldDelimiter == "" {
-			req.OutputSerialization.CSV.FieldDelimiter = ","
-		}
 		if req.InputSerialization.CSV.FileHeaderInfo == "" {
 			req.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoNone
 		}
 		if req.InputSerialization.CSV.RecordDelimiter == "" {
 			req.InputSerialization.CSV.RecordDelimiter = "\n"
 		}
-		s3s, err = csv.New(&csv.Options{
-			HasHeader:            req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
-			RecordDelimiter:      req.InputSerialization.CSV.RecordDelimiter,
-			FieldDelimiter:       req.InputSerialization.CSV.FieldDelimiter,
-			Comments:             req.InputSerialization.CSV.Comments,
-			Name:                 "S3Object", // Default table name for all objects
-			ReadFrom:             reader,
-			Compressed:           string(req.InputSerialization.CompressionType),
-			Expression:           cleanExpr(req.Expression),
-			OutputFieldDelimiter: req.OutputSerialization.CSV.FieldDelimiter,
-			StreamSize:           size,
-			HeaderOpt:            req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
-			Progress:             req.RequestProgress.Enabled,
-		})
+		options := &csv.Options{
+			Name:            "S3Object", // Default table name for all objects
+			HasHeader:       req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
+			RecordDelimiter: req.InputSerialization.CSV.RecordDelimiter,
+			FieldDelimiter:  req.InputSerialization.CSV.FieldDelimiter,
+			Comments:        req.InputSerialization.CSV.Comments,
+			ReadFrom:        reader,
+			Compressed:      string(req.InputSerialization.CompressionType),
+			Expression:      cleanExpr(req.Expression),
+			StreamSize:      size,
+			HeaderOpt:       req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
+			Progress:        req.RequestProgress.Enabled,
+		}
+		if req.OutputSerialization.CSV != nil {
+			if req.OutputSerialization.CSV.FieldDelimiter == "" {
+				req.OutputSerialization.CSV.FieldDelimiter = ","
+			}
+			options.OutputFieldDelimiter = req.OutputSerialization.CSV.FieldDelimiter
+			options.OutputRecordDelimiter = req.OutputSerialization.CSV.RecordDelimiter
+			options.OutputType = format.CSV
+		}
+		if req.OutputSerialization.JSON != nil {
+			options.OutputRecordDelimiter = req.OutputSerialization.JSON.RecordDelimiter
+			options.OutputType = format.JSON
+		}
+		// Initialize CSV input type
+		s3s, err = csv.New(options)
 	} else if req.InputSerialization.JSON != nil {
-		// Initializing options for JSON
-		s3s, err = json.New(&json.Options{
-			Name:       "S3Object", // Default table name for all objects
-			ReadFrom:   reader,
-			Compressed: string(req.InputSerialization.CompressionType),
-			Expression: cleanExpr(req.Expression),
-			StreamSize: size,
-			Type:       req.InputSerialization.JSON.Type == JSONTypeDocument,
-			Progress:   req.RequestProgress.Enabled,
-		})
+		options := &json.Options{
+			Name:         "S3Object", // Default table name for all objects
+			ReadFrom:     reader,
+			Compressed:   string(req.InputSerialization.CompressionType),
+			Expression:   cleanExpr(req.Expression),
+			StreamSize:   size,
+			DocumentType: req.InputSerialization.JSON.Type == JSONTypeDocument,
+			Progress:     req.RequestProgress.Enabled,
+		}
+		if req.OutputSerialization.JSON != nil {
+			options.OutputRecordDelimiter = req.OutputSerialization.JSON.RecordDelimiter
+			options.OutputType = format.JSON
+		}
+		if req.OutputSerialization.CSV != nil {
+			options.OutputFieldDelimiter = req.OutputSerialization.CSV.FieldDelimiter
+			options.OutputRecordDelimiter = req.OutputSerialization.CSV.RecordDelimiter
+			options.OutputType = format.CSV
+		}
+		// Initialize JSON input type
+		s3s, err = json.New(options)
 	}
 	return s3s, err
 }
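A condensed sketch of the output-option resolution above, with hypothetical local types: whichever OutputSerialization block the client supplies decides the output type, and the CSV field delimiter defaults to a comma.

```go
package main

import "fmt"

// Hypothetical stand-ins for the request types; illustration only.
type csvOut struct{ FieldDelimiter, RecordDelimiter string }
type jsonOut struct{ RecordDelimiter string }
type outputSerialization struct {
	CSV  *csvOut
	JSON *jsonOut
}

func resolve(out outputSerialization) (typ, fieldDelim, recordDelim string) {
	if out.CSV != nil {
		if out.CSV.FieldDelimiter == "" {
			out.CSV.FieldDelimiter = "," // same default as above
		}
		return "CSV", out.CSV.FieldDelimiter, out.CSV.RecordDelimiter
	}
	if out.JSON != nil {
		return "JSON", "", out.JSON.RecordDelimiter
	}
	return "", "", "" // no conversion requested
}

func main() {
	typ, fd, rd := resolve(outputSerialization{CSV: &csvOut{RecordDelimiter: "\n"}})
	fmt.Printf("type=%s field=%q record=%q\n", typ, fd, rd) // type=CSV field="," record="\n"
}
```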

View File

@@ -201,14 +201,19 @@ func processSelectReq(reqColNames []string, alias string, wc sqlparser.Expr, lre
 		lrecords = math.MaxInt64
 	}
 
-	columnsKv, err := columnsIndex(reqColNames, f)
-	if err != nil {
-		rowCh <- Row{
-			err: err,
-		}
-		return
-	}
-	var results = make([]string, len(columnsKv))
+	var results []string
+	var columnsKv []columnKv
+	if f.Type() == format.CSV {
+		var err error
+		columnsKv, err = columnsIndex(reqColNames, f)
+		if err != nil {
+			rowCh <- Row{
+				err: err,
+			}
+			return
+		}
+		results = make([]string, len(columnsKv))
+	}
 
 	for {
 		record, err := f.Read()
@@ -228,6 +233,19 @@
 			return
 		}
 
+		// For JSON multi-line input type, columns need
+		// to be handled for each record.
+		if f.Type() == format.JSON {
+			columnsKv, err = columnsIndex(reqColNames, f)
+			if err != nil {
+				rowCh <- Row{
+					err: err,
+				}
+				return
+			}
+			results = make([]string, len(columnsKv))
+		}
+
 		f.UpdateBytesProcessed(int64(len(record)))
 
 		// Return in case the number of records reaches the LIMIT
@@ -250,17 +268,17 @@
 		if condition {
 			// if it's an asterisk we just print everything in the row
 			if reqColNames[0] == "*" && fnNames[0] == "" {
-				switch f.Type() {
+				switch f.OutputType() {
 				case format.CSV:
 					for i, kv := range columnsKv {
 						results[i] = gjson.GetBytes(record, kv.Key).String()
 					}
 					rowCh <- Row{
-						record: strings.Join(results, f.OutputFieldDelimiter()) + "\n",
+						record: strings.Join(results, f.OutputFieldDelimiter()) + f.OutputRecordDelimiter(),
 					}
 				case format.JSON:
 					rowCh <- Row{
-						record: string(record) + "\n",
+						record: string(record) + f.OutputRecordDelimiter(),
 					}
 				}
 			} else if alias != "" {
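Row assembly after this change, in isolation: both branches now honor the client's record delimiter instead of a hard-coded newline. The delimiter values here are hypothetical.

```go
package main

import (
	"fmt"
	"strings"
)

func main() {
	results := []string{"alice", "30", "berlin"}
	fieldDelim := "|"     // hypothetical OutputSerialization.CSV.FieldDelimiter
	recordDelim := "\r\n" // hypothetical OutputSerialization.CSV.RecordDelimiter

	// Mirrors the CSV case above: join columns, terminate the row.
	fmt.Printf("%q\n", strings.Join(results, fieldDelim)+recordDelim) // "alice|30|berlin\r\n"
}
```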