mirror of
https://github.com/minio/minio.git
synced 2025-11-10 05:59:43 -05:00
SIMDJSON S3 select input (#8401)
This commit is contained in:
64
pkg/s3select/simdj/errors.go
Normal file
64
pkg/s3select/simdj/errors.go
Normal file
@@ -0,0 +1,64 @@
|
||||
/*
|
||||
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package simdj
|
||||
|
||||
import "fmt"
|
||||
|
||||
// s3Error is the error type returned by this package. It carries an error
// code, a human-readable message, an HTTP status code and the underlying
// error (if any) that triggered it.
type s3Error struct {
	code       string // Error code, e.g. "JSONParsingError".
	message    string // Message returned by Error and ErrorMessage.
	statusCode int    // HTTP status code reported by HTTPStatusCode.
	cause      error  // Underlying error; may be nil.
}

// Cause returns the underlying error, or nil if none was recorded.
func (err *s3Error) Cause() error {
	return err.cause
}

// Unwrap returns the underlying error so that errors.Is and errors.As can
// inspect the wrapped cause. It returns the same value as Cause.
func (err *s3Error) Unwrap() error {
	return err.cause
}

// ErrorCode returns the error code.
func (err *s3Error) ErrorCode() string {
	return err.code
}

// ErrorMessage returns the human-readable error message.
func (err *s3Error) ErrorMessage() string {
	return err.message
}

// HTTPStatusCode returns the HTTP status code associated with the error.
func (err *s3Error) HTTPStatusCode() int {
	return err.statusCode
}

// Error implements the error interface and returns the error message.
func (err *s3Error) Error() string {
	return err.message
}
|
||||
|
||||
func errInvalidJSONType(err error) *s3Error {
|
||||
return &s3Error{
|
||||
code: "InvalidJsonType",
|
||||
message: "The JsonType is invalid. Only DOCUMENT and LINES are supported.",
|
||||
statusCode: 400,
|
||||
cause: err,
|
||||
}
|
||||
}
|
||||
|
||||
func errJSONParsingError(err error) *s3Error {
|
||||
return &s3Error{
|
||||
code: "JSONParsingError",
|
||||
message: fmt.Sprintf("Encountered an error parsing the JSON file: %v. Check the file and try again.", err),
|
||||
statusCode: 400,
|
||||
cause: err,
|
||||
}
|
||||
}
|
||||
187
pkg/s3select/simdj/reader.go
Normal file
187
pkg/s3select/simdj/reader.go
Normal file
@@ -0,0 +1,187 @@
|
||||
/*
|
||||
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package simdj
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"sync"
|
||||
|
||||
"github.com/minio/minio/pkg/s3select/json"
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
"github.com/minio/simdjson-go"
|
||||
)
|
||||
|
||||
// Reader - JSON record reader for S3Select.
|
||||
type Reader struct {
|
||||
args *json.ReaderArgs
|
||||
input chan simdjson.Stream
|
||||
decoded chan simdjson.Object
|
||||
|
||||
// err will only be returned after decoded has been closed.
|
||||
err *error
|
||||
readCloser io.ReadCloser
|
||||
|
||||
exitReader chan struct{}
|
||||
readerWg sync.WaitGroup
|
||||
}
|
||||
|
||||
// Read - reads single record.
|
||||
func (r *Reader) Read(dst sql.Record) (sql.Record, error) {
|
||||
v, ok := <-r.decoded
|
||||
if !ok {
|
||||
if r.err != nil && *r.err != nil {
|
||||
return nil, errJSONParsingError(*r.err)
|
||||
}
|
||||
return nil, io.EOF
|
||||
}
|
||||
dstRec, ok := dst.(*Record)
|
||||
if !ok {
|
||||
dstRec = &Record{}
|
||||
}
|
||||
dstRec.object = v
|
||||
return dstRec, nil
|
||||
}
|
||||
|
||||
// Close - closes underlying reader.
|
||||
func (r *Reader) Close() error {
|
||||
// Close the input.
|
||||
// Potentially racy if the stream decoder is still reading.
|
||||
if r.readCloser != nil {
|
||||
r.readCloser.Close()
|
||||
}
|
||||
if r.exitReader != nil {
|
||||
close(r.exitReader)
|
||||
r.readerWg.Wait()
|
||||
r.exitReader = nil
|
||||
r.input = nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// startReader will start a reader that accepts input from r.input.
|
||||
// Input should be root -> object input. Each root indicates a record.
|
||||
// If r.input is closed, it is assumed that no more input will come.
|
||||
// When this function returns r.readerWg will be decremented and r.decoded will be closed.
|
||||
// On errors, r.err will be set. This should only be accessed after r.decoded has been closed.
|
||||
func (r *Reader) startReader() {
|
||||
defer r.readerWg.Done()
|
||||
defer close(r.decoded)
|
||||
var tmpObj simdjson.Object
|
||||
for {
|
||||
var in simdjson.Stream
|
||||
select {
|
||||
case in = <-r.input:
|
||||
case <-r.exitReader:
|
||||
return
|
||||
}
|
||||
if in.Error != nil && in.Error != io.EOF {
|
||||
r.err = &in.Error
|
||||
return
|
||||
}
|
||||
if in.Value == nil {
|
||||
if in.Error == io.EOF {
|
||||
return
|
||||
}
|
||||
continue
|
||||
}
|
||||
i := in.Value.Iter()
|
||||
readloop:
|
||||
for {
|
||||
var next simdjson.Iter
|
||||
typ, err := i.AdvanceIter(&next)
|
||||
if err != nil {
|
||||
r.err = &err
|
||||
return
|
||||
}
|
||||
switch typ {
|
||||
case simdjson.TypeNone:
|
||||
break readloop
|
||||
case simdjson.TypeRoot:
|
||||
typ, obj, err := next.Root(nil)
|
||||
if err != nil {
|
||||
r.err = &err
|
||||
return
|
||||
}
|
||||
if typ != simdjson.TypeObject {
|
||||
if typ == simdjson.TypeNone {
|
||||
continue
|
||||
}
|
||||
err = fmt.Errorf("unexpected json type below root :%v", typ)
|
||||
r.err = &err
|
||||
return
|
||||
}
|
||||
|
||||
o, err := obj.Object(&tmpObj)
|
||||
if err != nil {
|
||||
r.err = &err
|
||||
return
|
||||
}
|
||||
select {
|
||||
case <-r.exitReader:
|
||||
return
|
||||
case r.decoded <- *o:
|
||||
}
|
||||
default:
|
||||
err = fmt.Errorf("unexpected root json type:%v", typ)
|
||||
r.err = &err
|
||||
return
|
||||
}
|
||||
}
|
||||
if in.Error == io.EOF {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// NewReader - creates new JSON reader using readCloser.
|
||||
func NewReader(readCloser io.ReadCloser, args *json.ReaderArgs) *Reader {
|
||||
r := Reader{
|
||||
args: args,
|
||||
readCloser: readCloser,
|
||||
decoded: make(chan simdjson.Object, 1000),
|
||||
input: make(chan simdjson.Stream, 2),
|
||||
exitReader: make(chan struct{}),
|
||||
}
|
||||
simdjson.ParseNDStream(readCloser, r.input, nil)
|
||||
r.readerWg.Add(1)
|
||||
go r.startReader()
|
||||
return &r
|
||||
}
|
||||
|
||||
// NewElementReader - creates new JSON reader using readCloser.
|
||||
func NewElementReader(ch chan simdjson.Object, err *error, args *json.ReaderArgs) *Reader {
|
||||
return &Reader{
|
||||
args: args,
|
||||
decoded: ch,
|
||||
err: err,
|
||||
readCloser: nil,
|
||||
}
|
||||
}
|
||||
|
||||
// NewTapeReaderChan will start a reader that will read input from the provided channel.
|
||||
func NewTapeReaderChan(pj chan simdjson.Stream, args *json.ReaderArgs) *Reader {
|
||||
r := Reader{
|
||||
args: args,
|
||||
decoded: make(chan simdjson.Object, 1000),
|
||||
input: pj,
|
||||
exitReader: make(chan struct{}),
|
||||
}
|
||||
r.readerWg.Add(1)
|
||||
go r.startReader()
|
||||
return &r
|
||||
}
|
||||
165
pkg/s3select/simdj/reader_test.go
Normal file
165
pkg/s3select/simdj/reader_test.go
Normal file
@@ -0,0 +1,165 @@
|
||||
/*
|
||||
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package simdj
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/klauspost/compress/zstd"
|
||||
"github.com/minio/minio/pkg/s3select/json"
|
||||
"github.com/minio/simdjson-go"
|
||||
)
|
||||
|
||||
// tester is the minimal failure-reporting interface needed by the test
// helpers in this file.
type tester interface {
	// Fatal reports a failure with the given arguments.
	Fatal(args ...interface{})
}
|
||||
|
||||
func loadCompressed(t tester, file string) (js []byte) {
|
||||
dec, err := zstd.NewReader(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer dec.Close()
|
||||
js, err = ioutil.ReadFile(filepath.Join("testdata", file+".json.zst"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
js, err = dec.DecodeAll(js, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
return js
|
||||
}
|
||||
|
||||
// testCases lists the compressed fixtures (by base name under testdata)
// used by the tests in this file.
var testCases = []struct {
	name  string
	array bool
}{
	{name: "parking-citations-10", array: false},
}
|
||||
|
||||
func TestNDJSON(t *testing.T) {
|
||||
for _, tt := range testCases {
|
||||
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
ref := loadCompressed(t, tt.name)
|
||||
|
||||
var err error
|
||||
dst := make(chan simdjson.Object, 100)
|
||||
dec := NewElementReader(dst, &err, &json.ReaderArgs{ContentType: "json"})
|
||||
pj, err := simdjson.ParseND(ref, nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
i := pj.Iter()
|
||||
cpy := i
|
||||
b, err := cpy.MarshalJSON()
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if false {
|
||||
t.Log(string(b))
|
||||
}
|
||||
//_ = ioutil.WriteFile(filepath.Join("testdata", tt.name+".json"), b, os.ModePerm)
|
||||
|
||||
parser:
|
||||
for {
|
||||
var next simdjson.Iter
|
||||
typ, err := i.AdvanceIter(&next)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
switch typ {
|
||||
case simdjson.TypeNone:
|
||||
close(dst)
|
||||
break parser
|
||||
case simdjson.TypeRoot:
|
||||
typ, obj, err := next.Root(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if typ != simdjson.TypeObject {
|
||||
if typ == simdjson.TypeNone {
|
||||
close(dst)
|
||||
break parser
|
||||
}
|
||||
t.Fatal("Unexpected type:", typ.String())
|
||||
}
|
||||
|
||||
o, err := obj.Object(nil)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
dst <- *o
|
||||
default:
|
||||
t.Fatal("unexpected type:", typ.String())
|
||||
}
|
||||
}
|
||||
refDec := json.NewReader(ioutil.NopCloser(bytes.NewBuffer(ref)), &json.ReaderArgs{ContentType: "json"})
|
||||
|
||||
for {
|
||||
rec, err := dec.Read(nil)
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
want, err := refDec.Read(nil)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
var gotB, wantB bytes.Buffer
|
||||
err = rec.WriteCSV(&gotB, ',')
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
err = want.WriteCSV(&wantB, ',')
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
|
||||
if !bytes.Equal(gotB.Bytes(), wantB.Bytes()) {
|
||||
t.Errorf("CSV output mismatch.\nwant: %s(%x)\ngot: %s(%x)", wantB.String(), wantB.Bytes(), gotB.String(), gotB.Bytes())
|
||||
}
|
||||
gotB.Reset()
|
||||
wantB.Reset()
|
||||
|
||||
err = rec.WriteJSON(&gotB)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
err = want.WriteJSON(&wantB)
|
||||
if err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
// truncate newline from 'want'
|
||||
wantB.Truncate(wantB.Len() - 1)
|
||||
if !bytes.Equal(gotB.Bytes(), wantB.Bytes()) {
|
||||
t.Errorf("JSON output mismatch.\nwant: %s\ngot: %s", wantB.String(), gotB.String())
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
228
pkg/s3select/simdj/record.go
Normal file
228
pkg/s3select/simdj/record.go
Normal file
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package simdj
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"fmt"
|
||||
"io"
|
||||
|
||||
"github.com/bcicen/jstream"
|
||||
"github.com/minio/minio/pkg/s3select/json"
|
||||
"github.com/minio/minio/pkg/s3select/sql"
|
||||
"github.com/minio/simdjson-go"
|
||||
)
|
||||
|
||||
// Record - is JSON record.
|
||||
type Record struct {
|
||||
// object
|
||||
object simdjson.Object
|
||||
}
|
||||
|
||||
// Get - gets the value for a column name.
|
||||
func (r *Record) Get(name string) (*sql.Value, error) {
|
||||
elem := r.object.FindKey(name, nil)
|
||||
if elem == nil {
|
||||
return nil, nil
|
||||
}
|
||||
return iterToValue(elem.Iter)
|
||||
}
|
||||
|
||||
func iterToValue(iter simdjson.Iter) (*sql.Value, error) {
|
||||
switch iter.Type() {
|
||||
case simdjson.TypeString:
|
||||
v, err := iter.String()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return sql.FromString(v), nil
|
||||
case simdjson.TypeFloat:
|
||||
v, err := iter.Float()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return sql.FromFloat(v), nil
|
||||
case simdjson.TypeInt:
|
||||
v, err := iter.Int()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return sql.FromInt(v), nil
|
||||
case simdjson.TypeUint:
|
||||
v, err := iter.Int()
|
||||
if err != nil {
|
||||
// Can't fit into int, convert to float.
|
||||
v, err := iter.Float()
|
||||
return sql.FromFloat(v), err
|
||||
}
|
||||
return sql.FromInt(v), nil
|
||||
case simdjson.TypeBool:
|
||||
v, err := iter.Bool()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return sql.FromBool(v), nil
|
||||
case simdjson.TypeNull:
|
||||
return sql.FromNull(), nil
|
||||
case simdjson.TypeObject, simdjson.TypeArray:
|
||||
b, err := iter.MarshalJSON()
|
||||
return sql.FromBytes(b), err
|
||||
}
|
||||
return nil, fmt.Errorf("iterToValue: unknown JSON type: %s", iter.Type().String())
|
||||
}
|
||||
|
||||
// Reset the record.
|
||||
func (r *Record) Reset() {
|
||||
r.object = simdjson.Object{}
|
||||
}
|
||||
|
||||
// Clone the record and if possible use the destination provided.
|
||||
func (r *Record) Clone(dst sql.Record) sql.Record {
|
||||
other, ok := dst.(*Record)
|
||||
if !ok {
|
||||
other = &Record{}
|
||||
}
|
||||
other.object = r.object
|
||||
return other
|
||||
}
|
||||
|
||||
// CloneTo clones the record to a json Record.
|
||||
// Values are only unmashaled on object level.
|
||||
func (r *Record) CloneTo(dst *json.Record) (sql.Record, error) {
|
||||
if dst == nil {
|
||||
dst = &json.Record{SelectFormat: sql.SelectFmtJSON}
|
||||
}
|
||||
dst.Reset()
|
||||
elems, err := r.object.Parse(nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if cap(dst.KVS) < len(elems.Elements) {
|
||||
dst.KVS = make(jstream.KVS, 0, len(elems.Elements))
|
||||
}
|
||||
for _, elem := range elems.Elements {
|
||||
v, err := sql.IterToValue(elem.Iter)
|
||||
if err != nil {
|
||||
v, err = elem.Iter.Interface()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
dst.KVS = append(dst.KVS, jstream.KV{
|
||||
Key: elem.Name,
|
||||
Value: v,
|
||||
})
|
||||
}
|
||||
return dst, nil
|
||||
}
|
||||
|
||||
// Set - sets the value for a column name.
|
||||
func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
|
||||
dst, err := r.CloneTo(nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return dst.Set(name, value)
|
||||
}
|
||||
|
||||
// WriteCSV - encodes to CSV data.
|
||||
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune) error {
|
||||
csvRecord := make([]string, 0, 10)
|
||||
var tmp simdjson.Iter
|
||||
obj := r.object
|
||||
allElems:
|
||||
for {
|
||||
_, typ, err := obj.NextElement(&tmp)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
var columnValue string
|
||||
switch typ {
|
||||
case simdjson.TypeNull, simdjson.TypeFloat, simdjson.TypeUint, simdjson.TypeInt, simdjson.TypeBool, simdjson.TypeString:
|
||||
val, err := tmp.StringCvt()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
columnValue = val
|
||||
case simdjson.TypeObject, simdjson.TypeArray:
|
||||
b, err := tmp.MarshalJSON()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
columnValue = string(b)
|
||||
case simdjson.TypeNone:
|
||||
break allElems
|
||||
default:
|
||||
return fmt.Errorf("cannot marshal unhandled type: %s", typ.String())
|
||||
}
|
||||
csvRecord = append(csvRecord, columnValue)
|
||||
}
|
||||
w := csv.NewWriter(writer)
|
||||
w.Comma = fieldDelimiter
|
||||
if err := w.Write(csvRecord); err != nil {
|
||||
return err
|
||||
}
|
||||
w.Flush()
|
||||
if err := w.Error(); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Raw - returns the underlying representation.
|
||||
func (r *Record) Raw() (sql.SelectObjectFormat, interface{}) {
|
||||
return sql.SelectFmtSIMDJSON, r.object
|
||||
}
|
||||
|
||||
// WriteJSON - encodes to JSON data.
|
||||
func (r *Record) WriteJSON(writer io.Writer) error {
|
||||
o := r.object
|
||||
elems, err := o.Parse(nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
b, err := elems.MarshalJSON()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
n, err := writer.Write(b)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if n != len(b) {
|
||||
return io.ErrShortWrite
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Replace the underlying buffer of json data.
|
||||
func (r *Record) Replace(k interface{}) error {
|
||||
v, ok := k.(simdjson.Object)
|
||||
if !ok {
|
||||
return fmt.Errorf("cannot replace internal data in simd json record with type %T", k)
|
||||
}
|
||||
r.object = v
|
||||
return nil
|
||||
}
|
||||
|
||||
// NewRecord - creates new empty JSON record.
|
||||
func NewRecord(f sql.SelectObjectFormat, obj simdjson.Object) *Record {
|
||||
return &Record{
|
||||
object: obj,
|
||||
}
|
||||
}
|
||||
BIN
pkg/s3select/simdj/testdata/parking-citations-10.json.zst
vendored
Normal file
BIN
pkg/s3select/simdj/testdata/parking-citations-10.json.zst
vendored
Normal file
Binary file not shown.
Reference in New Issue
Block a user