Mirror of https://github.com/minio/minio.git (synced 2024-12-24 22:25:54 -05:00)
s3select should honour custom record delimiter (#6419)
Allow custom record delimiters such as `\r\n`, `a`, or `\r` in the input CSV and replace them with `\n`. Fixes #6403
This commit is contained in: parent 92bc7caf7a, commit 30d4a2cf53
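Conceptually, the change rewrites every occurrence of the configured record delimiter to `\n` before the CSV parser sees the data, so the parser only ever deals with newline-terminated records. A minimal, non-streaming sketch of that idea using only the Go standard library (illustration only; the commit itself does this chunk by chunk on an io.Reader via the DelimitedReader added below):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Records separated by a custom two-character delimiter "xv".
	in := "username,agexvbanana,12xvcarrot,23"

	// Normalizing the delimiter to "\n" yields a conventional newline-delimited CSV stream.
	out := strings.ReplaceAll(in, "xv", "\n")
	fmt.Println(out)
	// username,age
	// banana,12
	// carrot,23
}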
@@ -178,6 +178,10 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
 		writeErrorResponse(w, ErrInvalidQuoteFields, r.URL)
 		return
 	}
+	if len(selectReq.InputSerialization.CSV.RecordDelimiter) > 2 {
+		writeErrorResponse(w, ErrInvalidRequestParameter, r.URL)
+		return
+	}
 	getObject := objectAPI.GetObject
 	if api.CacheAPI() != nil && !crypto.SSEC.IsRequested(r.Header) {
@@ -222,9 +226,13 @@ func (api objectAPIHandlers) SelectObjectContentHandler(w http.ResponseWriter, r
 	if selectReq.InputSerialization.CSV.FileHeaderInfo == "" {
 		selectReq.InputSerialization.CSV.FileHeaderInfo = CSVFileHeaderInfoNone
 	}
+	if selectReq.InputSerialization.CSV.RecordDelimiter == "" {
+		selectReq.InputSerialization.CSV.RecordDelimiter = "\n"
+	}
 	if selectReq.InputSerialization.CSV != nil {
 		options := &s3select.Options{
 			HasHeader:       selectReq.InputSerialization.CSV.FileHeaderInfo != CSVFileHeaderInfoNone,
+			RecordDelimiter: selectReq.InputSerialization.CSV.RecordDelimiter,
 			FieldDelimiter:  selectReq.InputSerialization.CSV.FieldDelimiter,
 			Comments:        selectReq.InputSerialization.CSV.Comments,
 			Name:            "S3Object", // Default table name for all objects
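Restated as a standalone sketch, the two hunks above validate and default the requested delimiter: anything longer than two characters is rejected with ErrInvalidRequestParameter, and an empty value falls back to `\n`. The helper name below is hypothetical; the commit inlines this logic in the handler.

package main

import (
	"errors"
	"fmt"
)

// normalizeRecordDelimiter mirrors the handler logic above (hypothetical helper,
// for illustration only): reject delimiters longer than two characters and
// default an empty delimiter to "\n".
func normalizeRecordDelimiter(d string) (string, error) {
	if len(d) > 2 {
		return "", errors.New("invalid request parameter: record delimiter too long")
	}
	if d == "" {
		return "\n", nil
	}
	return d, nil
}

func main() {
	for _, d := range []string{"", "\r\n", "abc"} {
		nd, err := normalizeRecordDelimiter(d)
		fmt.Printf("%q -> %q, err: %v\n", d, nd, err)
	}
}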
pkg/ioutil/delimited-reader.go (new file, 87 lines)
@@ -0,0 +1,87 @@
+/*
+ * Minio Cloud Storage, (C) 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ioutil
+
+import (
+	"bufio"
+	"io"
+)
+
+var (
+	nByte byte = 10 // the byte that corresponds to the '\n' rune.
+	rByte byte = 13 // the byte that corresponds to the '\r' rune.
+)
+
+// DelimitedReader reduces the custom delimiter to `\n`.
+type DelimitedReader struct {
+	r           *bufio.Reader
+	delimiter   []rune // Select can have up to 2 characters as delimiter.
+	assignEmpty bool   // Decides whether the next read byte should be discarded.
+}
+
+// NewDelimitedReader detects the custom delimiter and replaces it with `\n`.
+func NewDelimitedReader(r io.Reader, delimiter []rune) *DelimitedReader {
+	return &DelimitedReader{r: bufio.NewReader(r), delimiter: delimiter, assignEmpty: false}
+}
+
+// Read reads and replaces the custom delimiter with `\n`.
+func (r *DelimitedReader) Read(p []byte) (n int, err error) {
+	n, err = r.r.Read(p)
+	if err != nil {
+		return
+	}
+	for i, b := range p {
+		if r.assignEmpty {
+			swapAndNullify(p, i)
+			r.assignEmpty = false
+			continue
+		}
+		if b == rByte && rune(b) != r.delimiter[0] {
+			// Replace the carriage returns with `\n`.
+			// Mac-style CSV will have `\r` as its record delimiter.
+			p[i] = nByte
+		} else if rune(b) == r.delimiter[0] { // E.g. `\r\n`, `ab`, `a` are valid delimiters.
+			if i+1 == len(p) && len(r.delimiter) > 1 {
+				// If the first delimiter match falls on the chunk boundary,
+				// peek the next byte and, if it matches, discard it in the next read.
+				if nextByte, nerr := r.r.Peek(1); nerr == nil {
+					if rune(nextByte[0]) == r.delimiter[1] {
+						p[i] = nByte
+						// To be discarded in the next read.
+						r.assignEmpty = true
+					}
+				}
+			} else if len(r.delimiter) > 1 && rune(p[i+1]) == r.delimiter[1] {
+				// The second delimiter byte falls in the same chunk.
+				p[i] = nByte
+				r.assignEmpty = true
+			} else if len(r.delimiter) == 1 {
+				// Replace with `\n` in case of a single-character delimiter match.
+				p[i] = nByte
+			}
+		}
+	}
+	return
+}
+
+// swapAndNullify shifts the subsequent bytes one position to the left and nullifies the last byte.
+func swapAndNullify(p []byte, n int) {
+	for i := n; i < len(p)-1; i++ {
+		p[i] = p[i+1]
+	}
+	p[len(p)-1] = 0
+}
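A small usage sketch for the reader added above, feeding the normalized stream into the standard encoding/csv parser. This is illustrative only: it assumes the github.com/minio/minio/pkg/ioutil import path introduced by this commit and picks a single-character delimiter, "#", that does not occur in the data.

package main

import (
	"encoding/csv"
	"fmt"
	"strings"

	"github.com/minio/minio/pkg/ioutil"
)

func main() {
	// Records separated by a custom single-character delimiter "#".
	raw := "username,age#banana,12#carrot,23"

	// DelimitedReader rewrites the "#" delimiter to "\n" on the fly,
	// so csv.Reader can consume the stream unchanged.
	dr := ioutil.NewDelimitedReader(strings.NewReader(raw), []rune("#"))
	records, err := csv.NewReader(dr).ReadAll()
	if err != nil {
		fmt.Println("read error:", err)
		return
	}
	fmt.Println(records) // [[username age] [banana 12] [carrot 23]]
}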
pkg/ioutil/delimited-reader_test.go (new file, 83 lines)
@@ -0,0 +1,83 @@
+/*
+ * Minio Cloud Storage, (C) 2016, 2017, 2018 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ioutil
+
+import (
+	"bytes"
+	"io"
+	"strings"
+	"testing"
+)
+
+// Test for DelimitedReader.
+func TestDelimitedReader(t *testing.T) {
+	expected := "username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45"
+
+	inputs := []struct {
+		inputcsv  string
+		delimiter string
+		chunkSize int
+	}{
+		// case 1 - with the default `\n` delimiter.
+		{"username,age\nbanana,12\ncarrot,23\napple,34\nbrinjal,90\nraddish,45", "\n", 10},
+		// case 2 - with carriage return `\r`, which should be replaced with `\n` by default.
+		{"username,age\rbanana,12\rcarrot,23\rapple,34\rbrinjal,90\rraddish,45", "\n", 10},
+		// case 3 - with a double-character delimiter (`\r\n`).
+		{"username,age\r\nbanana,12\r\ncarrot,23\r\napple,34\r\nbrinjal,90\r\nraddish,45", "\r\n", 10},
+		// case 4 - with a double-character delimiter.
+		{"username,agexvbanana,12xvcarrot,23xvapple,34xvbrinjal,90xvraddish,45", "xv", 10},
+		// case 5 - with a double-character delimiter `\t `.
+		{"username,age\t banana,12\t carrot,23\t apple,34\t brinjal,90\t raddish,45", "\t ", 10},
+		// case 6 - a special case where the first delimiter match falls in the 13th byte,
+		// i.e. the last byte of the read chunk. In this case the reader should peek the next byte
+		// and replace the pair with `\n`.
+		{"username,agexxbanana,12xxcarrot,23xxapple,34xxbrinjal,90xxraddish,45", "xx", 13},
+	}
+
+	for c, input := range inputs {
+		var readcsv []byte
+		var err error
+		delimitedReader := NewDelimitedReader(strings.NewReader(input.inputcsv), []rune(input.delimiter))
+		for err == nil {
+			chunk := make([]byte, input.chunkSize)
+			_, err = delimitedReader.Read(chunk)
+			readcsv = append(readcsv, chunk...)
+		}
+		if err != io.EOF {
+			t.Fatalf("Case %d: Error in delimited read", c+1)
+		}
+		expected := []byte(expected)
+		cleanCsv := removeNulls(readcsv)
+		if !bytes.Equal(cleanCsv, expected) {
+			t.Fatalf("Case %d: Expected the delimited csv to be `%s`, but instead found `%s`", c+1, string(expected), string(cleanCsv))
+		}
+	}
+}
+
+// removeNulls removes all the trailing nulls in the read chunks.
+// Null bytes are assigned when there is a reduction,
+// e.g. when `xv` is reduced to `\n`, the last byte is nullified.
+func removeNulls(csv []byte) []byte {
+	cleanCsv := []byte{}
+	for _, p := range csv {
+		if p != 0 {
+			cleanCsv = append(cleanCsv, p)
+		}
+	}
+	return cleanCsv
+}
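To exercise the new reader in isolation, running its unit test from the repository root should be enough (assuming a standard Go toolchain):

go test -v -run TestDelimitedReader ./pkg/ioutil/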
@@ -29,6 +29,7 @@ import (
 	"net/http"
 
 	gzip "github.com/klauspost/pgzip"
+	"github.com/minio/minio/pkg/ioutil"
 )
 
 const (
@@ -79,6 +80,9 @@ type Options struct {
 	// HasHeader when true, will treat the first row as a header row.
 	HasHeader bool
 
+	// RecordDelimiter is the string that records are delimited by.
+	RecordDelimiter string
+
 	// FieldDelimiter is the string that fields are delimited by.
 	FieldDelimiter string
 
@@ -127,7 +131,8 @@ func NewInput(opts *Options) (*Input, error) {
 		tempBytesScanned = opts.StreamSize
 		myReader = bzip2.NewReader(opts.ReadFrom)
 	}
+	// DelimitedReader treats custom record delimiter like `\r\n`,`\r`,`ab` etc and replaces it with `\n`.
+	normalizedReader := ioutil.NewDelimitedReader(myReader, []rune(opts.RecordDelimiter))
 	progress := &statInfo{
 		BytesScanned:   tempBytesScanned,
 		BytesProcessed: 0,
@@ -135,7 +140,7 @@ func NewInput(opts *Options) (*Input, error) {
 	}
 	reader := &Input{
 		options: opts,
-		reader:  csv.NewReader(myReader),
+		reader:  csv.NewReader(normalizedReader),
 		stats:   progress,
 	}
 	reader.firstRow = nil
@@ -48,6 +48,7 @@ func TestCheckForDuplicates(t *testing.T) {
 func TestMyProcessing(t *testing.T) {
 	options := &Options{
 		HasHeader:       false,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -96,6 +97,7 @@ func TestMyProcessing(t *testing.T) {
 func TestMyRowIndexResults(t *testing.T) {
 	options := &Options{
 		HasHeader:       false,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -271,6 +273,7 @@ func TestMyParser(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:       false,
+			RecordDelimiter: "\n",
 			FieldDelimiter:  ",",
 			Comments:        "",
 			Name:            "S3Object", // Default table name for all objects
@@ -346,6 +349,7 @@ func TestMyAggregationFunc(t *testing.T) {
 func TestToStringAgg(t *testing.T) {
 	options := &Options{
 		HasHeader:       false,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -380,6 +384,7 @@ func TestToStringAgg(t *testing.T) {
 func TestMyRowColLiteralResults(t *testing.T) {
 	options := &Options{
 		HasHeader:       false,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -459,6 +464,7 @@ func TestMyWhereEval(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:       false,
+			RecordDelimiter: "\n",
 			FieldDelimiter:  ",",
 			Comments:        "",
 			Name:            "S3Object", // Default table name for all objects
@@ -610,6 +616,7 @@ func TestInterpreter(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:       false,
+			RecordDelimiter: "\n",
 			FieldDelimiter:  ",",
 			Comments:        "",
 			Name:            "S3Object", // Default table name for all objects
@@ -651,6 +658,7 @@ func TestInterpreter(t *testing.T) {
 func TestMyXMLFunction(t *testing.T) {
 	options := &Options{
 		HasHeader:       false,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -689,6 +697,7 @@ func TestMyXMLFunction(t *testing.T) {
 func TestMyProtocolFunction(t *testing.T) {
 	options := &Options{
 		HasHeader:       false,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -732,6 +741,7 @@ func TestMyProtocolFunction(t *testing.T) {
 func TestMyInfoProtocolFunctions(t *testing.T) {
 	options := &Options{
 		HasHeader:       true,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -773,6 +783,7 @@ func TestMyInfoProtocolFunctions(t *testing.T) {
 func TestMyErrorProtocolFunctions(t *testing.T) {
 	options := &Options{
 		HasHeader:       false,
+		RecordDelimiter: "\n",
 		FieldDelimiter:  ",",
 		Comments:        "",
 		Name:            "S3Object", // Default table name for all objects
@@ -1007,6 +1018,7 @@ func TestMyValids(t *testing.T) {
 	for _, table := range tables {
 		options := &Options{
 			HasHeader:       false,
+			RecordDelimiter: "\n",
 			FieldDelimiter:  ",",
 			Comments:        "",
 			Name:            "S3Object", // Default table name for all objects