Mirror of https://github.com/minio/minio.git
Support JSON to CSV and CSV to JSON output format conversion (#6910)
This PR implements one of the pending items from issue #6286: through the S3 Select API, a user can now request CSV output for a JSON document and JSON output for a CSV document. The PR also refactors the code a little to support this feature.
parent 313ba74b09
commit 4c7c571875
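
To make the feature concrete before reading the diff: the following is a standalone sketch, not code from this PR, that converts CSV records into JSON records using the same sjson library the change relies on, including the reverse-order key insertion workaround explained in the cinput.Read hunk further down.

package main

import (
    "encoding/csv"
    "fmt"
    "strings"

    "github.com/tidwall/sjson"
)

func main() {
    // Toy CSV input with a header row, standing in for an S3 object.
    r := csv.NewReader(strings.NewReader("name,age\nalice,30\nbob,25\n"))
    rows, err := r.ReadAll()
    if err != nil {
        panic(err)
    }
    header, body := rows[0], rows[1:]
    for _, row := range body {
        var rec []byte
        // Set keys in reverse order; per the commit comment below, sjson
        // inserts key/value pairs first-in-last-out, so this preserves
        // the input column order.
        for i := len(row) - 1; i >= 0; i-- {
            if rec, err = sjson.SetBytes(rec, header[i], row[i]); err != nil {
                panic(err)
            }
        }
        fmt.Println(string(rec))
    }
}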
@@ -218,11 +218,13 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
         writeErrorResponse(w, ErrInvalidFileHeaderInfo, r.URL, guessIsBrowserReq(r))
         return
     }
-    if selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAlways &&
-        selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAsNeeded &&
-        selectReq.OutputSerialization.CSV.QuoteFields != "" {
-        writeErrorResponse(w, ErrInvalidQuoteFields, r.URL, guessIsBrowserReq(r))
-        return
+    if selectReq.OutputSerialization.CSV != nil {
+        if selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAlways &&
+            selectReq.OutputSerialization.CSV.QuoteFields != s3select.CSVQuoteFieldsAsNeeded &&
+            selectReq.OutputSerialization.CSV.QuoteFields != "" {
+            writeErrorResponse(w, ErrInvalidQuoteFields, r.URL, guessIsBrowserReq(r))
+            return
+        }
     }
     if len(selectReq.InputSerialization.CSV.RecordDelimiter) > 2 {
         writeErrorResponse(w, ErrInvalidRequestParameter, r.URL, guessIsBrowserReq(r))
@@ -57,9 +57,12 @@ type Options struct {
     // SQL expression meant to be evaluated.
     Expression string

-    // What the outputted CSV will be delimited by .
+    // Output CSV will be delimited by.
    OutputFieldDelimiter string

+    // Output CSV record will be delimited by.
+    OutputRecordDelimiter string
+
     // Size of incoming object
     StreamSize int64
@@ -68,6 +71,9 @@ type Options struct {

     // Progress enabled, enable/disable progress messages.
     Progress bool
+
+    // Output format type, supported values are CSV and JSON
+    OutputType format.Type
 }

 // cinput represents a record producing input from a formatted object.
@@ -147,6 +153,9 @@ func (reader *cinput) readHeader() error {
         reader.firstRow = nil
     } else {
         reader.firstRow, readErr = reader.reader.Read()
+        if readErr != nil {
+            return format.ErrCSVParsingError
+        }
         reader.header = make([]string, len(reader.firstRow))
         for i := range reader.firstRow {
             reader.header[i] = "_" + strconv.Itoa(i)
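
An aside on the context lines above: when the CSV object has no usable header row, columns get positional names. A minimal standalone sketch of that naming scheme:

package main

import (
    "fmt"
    "strconv"
)

func main() {
    // Columns of a headerless CSV row get positional names _0, _1, ...,
    // mirroring the loop in readHeader above.
    firstRow := []string{"alice", "30", "zurich"}
    header := make([]string, len(firstRow))
    for i := range firstRow {
        header[i] = "_" + strconv.Itoa(i)
    }
    fmt.Println(header) // [_0 _1 _2]
}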
@@ -173,8 +182,13 @@ func (reader *cinput) Read() ([]byte, error) {
     if dec != nil {
         var data []byte
         var err error
-        for i, value := range dec {
-            data, err = sjson.SetBytes(data, reader.header[i], value)
+        // Navigate column values in reverse order to preserve
+        // the input order for AWS S3 compatibility, because
+        // sjson adds json key/value pairs in first in last out
+        // fashion. This should be fixed in sjson ideally. Following
+        // work around is needed to circumvent this issue for now.
+        for i := len(dec) - 1; i >= 0; i-- {
+            data, err = sjson.SetBytes(data, reader.header[i], dec[i])
             if err != nil {
                 return nil, err
             }
@@ -184,11 +198,16 @@ func (reader *cinput) Read() ([]byte, error) {
     return nil, nil
 }

-// OutputFieldDelimiter - returns the delimiter specified in input request
+// OutputFieldDelimiter - returns the requested output field delimiter.
 func (reader *cinput) OutputFieldDelimiter() string {
     return reader.options.OutputFieldDelimiter
 }

+// OutputRecordDelimiter - returns the requested output record delimiter.
+func (reader *cinput) OutputRecordDelimiter() string {
+    return reader.options.OutputRecordDelimiter
+}
+
 // HasHeader - returns true or false depending upon the header.
 func (reader *cinput) HasHeader() bool {
     return reader.options.HasHeader
@@ -285,11 +304,16 @@ func (reader *cinput) CreateProgressXML() (string, error) {
     return xml.Header + string(out), nil
 }

-// Type - return the data format type {
+// Type - return the data format type
 func (reader *cinput) Type() format.Type {
     return format.CSV
 }

+// OutputType - return the data format type
+func (reader *cinput) OutputType() format.Type {
+    return reader.options.OutputType
+}
+
 // ColNameErrs is a function which makes sure that the headers are requested are
 // present in the file otherwise it throws an error.
 func (reader *cinput) ColNameErrs(columnNames []string) error {
@@ -22,6 +22,7 @@ import (
     "io"

     "github.com/minio/minio/pkg/s3select/format"
+    "github.com/tidwall/gjson"
 )

 // Options options are passed to the underlying encoding/json reader.
@@ -40,24 +41,32 @@ type Options struct {
     // SQL expression meant to be evaluated.
     Expression string

-    // What the outputted will be delimited by .
+    // Input record delimiter.
     RecordDelimiter string

+    // Output CSV will be delimited by.
+    OutputFieldDelimiter string
+
+    // Output record delimiter.
+    OutputRecordDelimiter string
+
     // Size of incoming object
     StreamSize int64

-    // True if Type is DOCUMENTS
-    Type bool
+    // True if DocumentType is DOCUMENTS
+    DocumentType bool

     // Progress enabled, enable/disable progress messages.
     Progress bool
+
+    // Output format type, supported values are CSV and JSON
+    OutputType format.Type
 }

 // jinput represents a record producing input from a formatted file or pipe.
 type jinput struct {
     options         *Options
     reader          *bufio.Reader
-    firstRow        []string
     header          []string
     minOutputLength int
     stats           struct {
@@ -79,7 +88,6 @@ func New(opts *Options) (format.Select, error) {
     reader.stats.BytesScanned = opts.StreamSize
     reader.stats.BytesProcessed = 0
     reader.stats.BytesReturned = 0

     return reader, nil
 }
@@ -95,7 +103,7 @@ func (reader *jinput) UpdateBytesProcessed(size int64) {

 // Read the file and returns
 func (reader *jinput) Read() ([]byte, error) {
-    data, err := reader.reader.ReadBytes('\n')
+    data, _, err := reader.reader.ReadLine()
     if err != nil {
         if err == io.EOF || err == io.ErrClosedPipe {
             err = nil
@@ -103,17 +111,32 @@ func (reader *jinput) Read() ([]byte, error) {
             err = format.ErrJSONParsingError
         }
     }
+    if err == nil {
+        var header []string
+        gjson.ParseBytes(data).ForEach(func(key, value gjson.Result) bool {
+            header = append(header, key.String())
+            return true
+        })
+        reader.header = header
+    }
     return data, err
 }

-// OutputFieldDelimiter - returns the delimiter specified in input request
+// OutputFieldDelimiter - returns the delimiter specified in input request,
+// for JSON output this value is empty, but does have a value when
+// output type is CSV.
 func (reader *jinput) OutputFieldDelimiter() string {
-    return ","
+    return reader.options.OutputFieldDelimiter
+}
+
+// OutputRecordDelimiter - returns the delimiter specified in input request, after each JSON record.
+func (reader *jinput) OutputRecordDelimiter() string {
+    return reader.options.OutputRecordDelimiter
 }

 // HasHeader - returns true or false depending upon the header.
 func (reader *jinput) HasHeader() bool {
-    return false
+    return true
 }

 // Expression - return the Select Expression for
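
The header extraction added to Read above can be exercised on its own; a minimal sketch using the same gjson calls:

package main

import (
    "fmt"

    "github.com/tidwall/gjson"
)

func main() {
    // Collect the top-level keys of one JSON record, as the new code in
    // jinput.Read does to build a per-record header.
    record := []byte(`{"name":"alice","age":30,"city":"zurich"}`)
    var header []string
    gjson.ParseBytes(record).ForEach(func(key, value gjson.Result) bool {
        header = append(header, key.String())
        return true // keep iterating
    })
    fmt.Println(header) // [name age city]
}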
@@ -128,7 +151,7 @@ func (reader *jinput) UpdateBytesReturned(size int64) {

 // Header returns a nil in case of
 func (reader *jinput) Header() []string {
-    return nil
+    return reader.header
 }

 // CreateStatXML is the function which does the marshaling from the stat
@@ -171,6 +194,11 @@ func (reader *jinput) Type() format.Type {
     return format.JSON
 }

+// OutputType - return the data format type
+func (reader *jinput) OutputType() format.Type {
+    return reader.options.OutputType
+}
+
 // ColNameErrs - this is a dummy function for JSON input type.
 func (reader *jinput) ColNameErrs(columnNames []string) error {
     return nil
@@ -22,10 +22,12 @@ import "encoding/xml"
 // https://docs.aws.amazon.com/AmazonS3/latest/API/RESTObjectSELECTContent.html
 type Select interface {
     Type() Type
+    OutputType() Type
     Read() ([]byte, error)
     Header() []string
     HasHeader() bool
     OutputFieldDelimiter() string
+    OutputRecordDelimiter() string
     UpdateBytesProcessed(int64)
     Expression() string
     UpdateBytesReturned(int64)
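
Every format.Select implementation now has to provide the two new methods; a toy stand-in showing the minimal additions (method names from the interface above, everything else assumed for illustration):

package main

import "fmt"

// Type stands in for format.Type here.
type Type string

type stubFormat struct {
    outputType      Type
    recordDelimiter string
}

// The two methods this commit adds to the Select interface.
func (s stubFormat) OutputType() Type              { return s.outputType }
func (s stubFormat) OutputRecordDelimiter() string { return s.recordDelimiter }

func main() {
    s := stubFormat{outputType: "csv", recordDelimiter: "\n"}
    fmt.Printf("%s %q\n", s.OutputType(), s.OutputRecordDelimiter())
}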
@@ -65,40 +65,60 @@ func New(reader io.Reader, size int64, req ObjectSelectRequest) (s3s format.Sele

     // Initializating options for CSV
     if req.InputSerialization.CSV != nil {
-        if req.OutputSerialization.CSV.FieldDelimiter == "" {
-            req.OutputSerialization.CSV.FieldDelimiter = ","
-        }
         if req.InputSerialization.CSV.FileHeaderInfo == "" {
             req.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoNone
         }
         if req.InputSerialization.CSV.RecordDelimiter == "" {
             req.InputSerialization.CSV.RecordDelimiter = "\n"
         }
-        s3s, err = csv.New(&csv.Options{
-            HasHeader:            req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
-            RecordDelimiter:      req.InputSerialization.CSV.RecordDelimiter,
-            FieldDelimiter:       req.InputSerialization.CSV.FieldDelimiter,
-            Comments:             req.InputSerialization.CSV.Comments,
-            Name:                 "S3Object", // Default table name for all objects
-            ReadFrom:             reader,
-            Compressed:           string(req.InputSerialization.CompressionType),
-            Expression:           cleanExpr(req.Expression),
-            OutputFieldDelimiter: req.OutputSerialization.CSV.FieldDelimiter,
-            StreamSize:           size,
-            HeaderOpt:            req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
-            Progress:             req.RequestProgress.Enabled,
-        })
+        options := &csv.Options{
+            Name:            "S3Object", // Default table name for all objects
+            HasHeader:       req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
+            RecordDelimiter: req.InputSerialization.CSV.RecordDelimiter,
+            FieldDelimiter:  req.InputSerialization.CSV.FieldDelimiter,
+            Comments:        req.InputSerialization.CSV.Comments,
+            ReadFrom:        reader,
+            Compressed:      string(req.InputSerialization.CompressionType),
+            Expression:      cleanExpr(req.Expression),
+            StreamSize:      size,
+            HeaderOpt:       req.InputSerialization.CSV.FileHeaderInfo == CSVFileHeaderInfoUse,
+            Progress:        req.RequestProgress.Enabled,
+        }
+        if req.OutputSerialization.CSV != nil {
+            if req.OutputSerialization.CSV.FieldDelimiter == "" {
+                req.OutputSerialization.CSV.FieldDelimiter = ","
+            }
+            options.OutputFieldDelimiter = req.OutputSerialization.CSV.FieldDelimiter
+            options.OutputRecordDelimiter = req.OutputSerialization.CSV.RecordDelimiter
+            options.OutputType = format.CSV
+        }
+        if req.OutputSerialization.JSON != nil {
+            options.OutputRecordDelimiter = req.OutputSerialization.JSON.RecordDelimiter
+            options.OutputType = format.JSON
+        }
+        // Initialize CSV input type
+        s3s, err = csv.New(options)
     } else if req.InputSerialization.JSON != nil {
-        // Initializating options for JSON
-        s3s, err = json.New(&json.Options{
-            Name:       "S3Object", // Default table name for all objects
-            ReadFrom:   reader,
-            Compressed: string(req.InputSerialization.CompressionType),
-            Expression: cleanExpr(req.Expression),
-            StreamSize: size,
-            Type:       req.InputSerialization.JSON.Type == JSONTypeDocument,
-            Progress:   req.RequestProgress.Enabled,
-        })
+        options := &json.Options{
+            Name:         "S3Object", // Default table name for all objects
+            ReadFrom:     reader,
+            Compressed:   string(req.InputSerialization.CompressionType),
+            Expression:   cleanExpr(req.Expression),
+            StreamSize:   size,
+            DocumentType: req.InputSerialization.JSON.Type == JSONTypeDocument,
+            Progress:     req.RequestProgress.Enabled,
+        }
+        if req.OutputSerialization.JSON != nil {
+            options.OutputRecordDelimiter = req.OutputSerialization.JSON.RecordDelimiter
+            options.OutputType = format.JSON
+        }
+        if req.OutputSerialization.CSV != nil {
+            options.OutputFieldDelimiter = req.OutputSerialization.CSV.FieldDelimiter
+            options.OutputRecordDelimiter = req.OutputSerialization.CSV.RecordDelimiter
+            options.OutputType = format.CSV
+        }
+        // Initialize JSON input type
+        s3s, err = json.New(options)
     }
     return s3s, err
 }
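
Distilled from the wiring above, the rule is: whichever OutputSerialization block the request carries picks the output type and delimiters, with "," as the default CSV field delimiter. A hypothetical, self-contained restatement with toy types, assuming only one output block is set (this helper is not in the PR):

package main

import "fmt"

type csvOut struct{ FieldDelimiter, RecordDelimiter string }
type jsonOut struct{ RecordDelimiter string }

// pickOutput mirrors the decision made in New above for either input type.
func pickOutput(c *csvOut, j *jsonOut) (typ, fieldDelim, recordDelim string) {
    if c != nil {
        if c.FieldDelimiter == "" {
            c.FieldDelimiter = "," // default, as in the diff
        }
        return "csv", c.FieldDelimiter, c.RecordDelimiter
    }
    if j != nil {
        return "json", "", j.RecordDelimiter
    }
    return "", "", ""
}

func main() {
    typ, fd, rd := pickOutput(&csvOut{RecordDelimiter: "\n"}, nil)
    fmt.Printf("%s %q %q\n", typ, fd, rd) // csv "," "\n"
}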
@@ -201,14 +201,19 @@ func processSelectReq(reqColNames []string, alias string, wc sqlparser.Expr, lre
         lrecords = math.MaxInt64
     }

-    columnsKv, err := columnsIndex(reqColNames, f)
-    if err != nil {
-        rowCh <- Row{
-            err: err,
+    var results []string
+    var columnsKv []columnKv
+    if f.Type() == format.CSV {
+        var err error
+        columnsKv, err = columnsIndex(reqColNames, f)
+        if err != nil {
+            rowCh <- Row{
+                err: err,
+            }
+            return
         }
-        return
+        results = make([]string, len(columnsKv))
     }
-    var results = make([]string, len(columnsKv))

     for {
         record, err := f.Read()
@@ -228,6 +233,19 @@ func processSelectReq(reqColNames []string, alias string, wc sqlparser.Expr, lre
             return
         }

+        // For JSON multi-line input type columns needs
+        // to be handled for each record.
+        if f.Type() == format.JSON {
+            columnsKv, err = columnsIndex(reqColNames, f)
+            if err != nil {
+                rowCh <- Row{
+                    err: err,
+                }
+                return
+            }
+            results = make([]string, len(columnsKv))
+        }
+
         f.UpdateBytesProcessed(int64(len(record)))

         // Return in case the number of record reaches the LIMIT
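
The comment in the hunk above is the key behavioral difference from CSV: JSON records in one stream need not share a key set, so the column index is rebuilt per record. A small standalone illustration:

package main

import (
    "fmt"

    "github.com/tidwall/gjson"
)

func main() {
    records := [][]byte{
        []byte(`{"name":"alice","age":30}`),
        []byte(`{"name":"bob","city":"zurich"}`), // different key set
    }
    for _, rec := range records {
        var keys []string
        gjson.ParseBytes(rec).ForEach(func(k, v gjson.Result) bool {
            keys = append(keys, k.String())
            return true
        })
        fmt.Println(keys) // [name age], then [name city]
    }
}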
@@ -250,17 +268,17 @@ func processSelectReq(reqColNames []string, alias string, wc sqlparser.Expr, lre
         if condition {
             // if its an asterix we just print everything in the row
             if reqColNames[0] == "*" && fnNames[0] == "" {
-                switch f.Type() {
+                switch f.OutputType() {
                 case format.CSV:
                     for i, kv := range columnsKv {
                         results[i] = gjson.GetBytes(record, kv.Key).String()
                     }
                     rowCh <- Row{
-                        record: strings.Join(results, f.OutputFieldDelimiter()) + "\n",
+                        record: strings.Join(results, f.OutputFieldDelimiter()) + f.OutputRecordDelimiter(),
                     }
                 case format.JSON:
                     rowCh <- Row{
-                        record: string(record) + "\n",
+                        record: string(record) + f.OutputRecordDelimiter(),
                     }
                 }
             } else if alias != "" {
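
Finally, the two output paths above reduce to: join the projected values with the field delimiter for CSV, or pass the record through for JSON, each terminated by the configured record delimiter. A sketch under those assumptions, using the same gjson projection call:

package main

import (
    "fmt"
    "strings"

    "github.com/tidwall/gjson"
)

func main() {
    record := []byte(`{"name":"alice","age":30}`)
    keys := []string{"name", "age"} // stand-ins for columnsKv keys
    fieldDelim, recordDelim := ",", "\n"

    // CSV output path: project values, join, append record delimiter.
    results := make([]string, len(keys))
    for i, k := range keys {
        results[i] = gjson.GetBytes(record, k).String()
    }
    fmt.Print(strings.Join(results, fieldDelim) + recordDelim) // alice,30

    // JSON output path: emit the record plus the record delimiter.
    fmt.Print(string(record) + recordDelim) // {"name":"alice","age":30}
}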