SIMDJSON S3 select input (#8401)

This commit is contained in:
Klaus Post
2020-02-13 14:03:52 -08:00
committed by GitHub
parent d1144c2c7e
commit e4020fb41f
16 changed files with 1116 additions and 73 deletions

View File

@@ -0,0 +1,64 @@
/*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package simdj
import "fmt"
// s3Error describes a failed S3 Select request: an error code, a
// human-readable message, the HTTP status code to respond with and the
// underlying error that caused it.
type s3Error struct {
	code       string
	message    string
	statusCode int
	cause      error
}

// Cause returns the underlying error stored in the cause field.
func (e *s3Error) Cause() error { return e.cause }

// ErrorCode returns the error code.
func (e *s3Error) ErrorCode() string { return e.code }

// ErrorMessage returns the error message.
func (e *s3Error) ErrorMessage() string { return e.message }

// HTTPStatusCode returns the HTTP status code associated with the error.
func (e *s3Error) HTTPStatusCode() int { return e.statusCode }

// Error implements the error interface and returns the same text as
// ErrorMessage.
func (e *s3Error) Error() string { return e.message }
// errInvalidJSONType builds the "InvalidJsonType" s3Error (HTTP status 400)
// and records err as its cause.
func errInvalidJSONType(err error) *s3Error {
	e := new(s3Error)
	e.code = "InvalidJsonType"
	e.message = "The JsonType is invalid. Only DOCUMENT and LINES are supported."
	e.statusCode = 400
	e.cause = err
	return e
}
// errJSONParsingError builds the "JSONParsingError" s3Error (HTTP status 400).
// The text of err is embedded in the message and err is recorded as the cause.
func errJSONParsingError(err error) *s3Error {
	msg := fmt.Sprintf("Encountered an error parsing the JSON file: %v. Check the file and try again.", err)
	e := new(s3Error)
	e.code = "JSONParsingError"
	e.message = msg
	e.statusCode = 400
	e.cause = err
	return e
}

View File

@@ -0,0 +1,187 @@
/*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package simdj
import (
"fmt"
"io"
"sync"
"github.com/minio/minio/pkg/s3select/json"
"github.com/minio/minio/pkg/s3select/sql"
"github.com/minio/simdjson-go"
)
// Reader - JSON record reader for S3Select.
// Read receives records from the decoded channel. Readers created by
// NewReader and NewTapeReaderChan fill that channel from a startReader
// goroutine; NewElementReader uses a caller-supplied channel instead.
type Reader struct {
// args holds the reader arguments passed to the constructor.
args *json.ReaderArgs
// input delivers parsed streams to startReader.
input chan simdjson.Stream
// decoded delivers one object per record to Read.
decoded chan simdjson.Object
// err will only be returned after decoded has been closed.
err *error
// readCloser is the underlying input; Close closes it when it is non-nil.
readCloser io.ReadCloser
// exitReader is closed by Close to tell startReader to stop.
exitReader chan struct{}
// readerWg lets Close wait until startReader has returned.
readerWg sync.WaitGroup
}
// Read - reads single record.
// When dst is a *Record it is reused, otherwise a new Record is allocated.
// Once the decoded channel has been closed, Read returns the recorded error
// wrapped as a JSON parsing error, or io.EOF when no error was recorded.
func (r *Reader) Read(dst sql.Record) (sql.Record, error) {
	obj, open := <-r.decoded
	if open {
		rec, reusable := dst.(*Record)
		if !reusable {
			rec = &Record{}
		}
		rec.object = obj
		return rec, nil
	}

	// The channel is drained and closed; only now may r.err be inspected.
	if r.err == nil || *r.err == nil {
		return nil, io.EOF
	}
	return nil, errJSONParsingError(*r.err)
}
// Close - closes underlying reader.
// If a reader goroutine was started, it is signalled to exit and waited for.
// Close always returns nil.
func (r *Reader) Close() error {
	// Close the input first.
	// Potentially racy if the stream decoder is still reading.
	if rc := r.readCloser; rc != nil {
		rc.Close()
	}

	if r.exitReader == nil {
		// No reader goroutine to stop (or it was already stopped).
		return nil
	}

	close(r.exitReader)
	r.readerWg.Wait()
	r.exitReader = nil
	r.input = nil
	return nil
}
// startReader will start a reader that accepts input from r.input.
// Input should be root -> object input. Each root indicates a record.
// If r.input is closed, it is assumed that no more input will come.
// Reading also stops when r.exitReader is closed, when a stream carries
// io.EOF, or on the first error.
// When this function returns r.readerWg will be decremented and r.decoded will be closed.
// On errors, r.err will be set. This should only be accessed after r.decoded has been closed.
func (r *Reader) startReader() {
	defer r.readerWg.Done()
	defer close(r.decoded)
	var tmpObj simdjson.Object
	for {
		var in simdjson.Stream
		select {
		case v, ok := <-r.input:
			if !ok {
				// A receive from a closed channel yields a zero Stream
				// (nil Value, nil Error). Without this check the loop
				// below would skip it and spin forever.
				return
			}
			in = v
		case <-r.exitReader:
			return
		}
		if in.Error != nil && in.Error != io.EOF {
			r.err = &in.Error
			return
		}
		if in.Value == nil {
			if in.Error == io.EOF {
				return
			}
			// Nothing parsed in this stream element; wait for the next one.
			continue
		}
		i := in.Value.Iter()
	readloop:
		for {
			var next simdjson.Iter
			typ, err := i.AdvanceIter(&next)
			if err != nil {
				r.err = &err
				return
			}
			switch typ {
			case simdjson.TypeNone:
				// No more elements in this stream element.
				break readloop
			case simdjson.TypeRoot:
				typ, obj, err := next.Root(nil)
				if err != nil {
					r.err = &err
					return
				}
				if typ != simdjson.TypeObject {
					if typ == simdjson.TypeNone {
						// Empty root; move on to the next element.
						continue
					}
					err = fmt.Errorf("unexpected json type below root :%v", typ)
					r.err = &err
					return
				}
				o, err := obj.Object(&tmpObj)
				if err != nil {
					r.err = &err
					return
				}
				// Hand over a copy of the object, unless asked to exit.
				select {
				case <-r.exitReader:
					return
				case r.decoded <- *o:
				}
			default:
				err = fmt.Errorf("unexpected root json type:%v", typ)
				r.err = &err
				return
			}
		}
		if in.Error == io.EOF {
			// This stream element carried io.EOF together with its data.
			return
		}
	}
}
// NewReader - creates new JSON reader using readCloser.
// The input is passed to simdjson.ParseNDStream, and a goroutine running
// startReader turns the parsed streams into records.
func NewReader(readCloser io.ReadCloser, args *json.ReaderArgs) *Reader {
	r := &Reader{
		args:       args,
		readCloser: readCloser,
		decoded:    make(chan simdjson.Object, 1000),
		input:      make(chan simdjson.Stream, 2),
		exitReader: make(chan struct{}),
	}

	simdjson.ParseNDStream(readCloser, r.input, nil)

	r.readerWg.Add(1)
	go r.startReader()
	return r
}
// NewElementReader - creates new JSON reader that serves objects received
// from ch. No input is parsed and no goroutine is started by this reader.
// The error pointed to by err is examined by Read once ch has been closed.
func NewElementReader(ch chan simdjson.Object, err *error, args *json.ReaderArgs) *Reader {
	r := new(Reader)
	r.args = args
	r.decoded = ch
	r.err = err
	// readCloser stays nil, there is no underlying input to close.
	return r
}
// NewTapeReaderChan will start a reader that will read input from the provided channel.
// A goroutine running startReader converts the received streams to records.
func NewTapeReaderChan(pj chan simdjson.Stream, args *json.ReaderArgs) *Reader {
	r := new(Reader)
	r.args = args
	r.input = pj
	r.decoded = make(chan simdjson.Object, 1000)
	r.exitReader = make(chan struct{})

	r.readerWg.Add(1)
	go r.startReader()
	return r
}

View File

@@ -0,0 +1,165 @@
/*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package simdj
import (
"bytes"
"io"
"io/ioutil"
"path/filepath"
"testing"
"github.com/klauspost/compress/zstd"
"github.com/minio/minio/pkg/s3select/json"
"github.com/minio/simdjson-go"
)
// tester is the failure-reporting interface required by loadCompressed.
// TestNDJSON passes its *testing.T as a tester.
type tester interface {
// Fatal reports a failure; it has the same signature as (*testing.T).Fatal.
Fatal(args ...interface{})
}
// loadCompressed reads testdata/<file>.json.zst and returns its
// zstd-decompressed content. Every error is reported through t.Fatal.
func loadCompressed(t tester, file string) (js []byte) {
	dec, err := zstd.NewReader(nil)
	if err != nil {
		t.Fatal(err)
	}
	defer dec.Close()

	path := filepath.Join("testdata", file+".json.zst")
	compressed, err := ioutil.ReadFile(path)
	if err != nil {
		t.Fatal(err)
	}

	js, err = dec.DecodeAll(compressed, nil)
	if err != nil {
		t.Fatal(err)
	}
	return js
}
// testCases lists the fixtures used by TestNDJSON. name is the base name of a
// compressed file below testdata (see loadCompressed).
var testCases = []struct {
	name  string
	array bool
}{
	{name: "parking-citations-10", array: false},
}
// TestNDJSON checks that records served by the simdjson based reader produce
// the same CSV and JSON output as the reference reader of the json package.
func TestNDJSON(t *testing.T) {
	for _, tt := range testCases {
		t.Run(tt.name, func(t *testing.T) {
			ref := loadCompressed(t, tt.name)

			var err error
			// All objects are queued on dst before any of them is read back,
			// so a fixture must not contain more records than the channel
			// can buffer.
			dst := make(chan simdjson.Object, 100)
			dec := NewElementReader(dst, &err, &json.ReaderArgs{ContentType: "json"})
			pj, err := simdjson.ParseND(ref, nil)
			if err != nil {
				t.Fatal(err)
			}
			i := pj.Iter()
			cpy := i
			b, err := cpy.MarshalJSON()
			if err != nil {
				t.Fatal(err)
			}
			if false {
				// Flip to true to dump the parsed input while debugging.
				t.Log(string(b))
			}
			//_ = ioutil.WriteFile(filepath.Join("testdata", tt.name+".json"), b, os.ModePerm)

		parser:
			for {
				var next simdjson.Iter
				typ, err := i.AdvanceIter(&next)
				if err != nil {
					t.Fatal(err)
				}
				switch typ {
				case simdjson.TypeNone:
					close(dst)
					break parser
				case simdjson.TypeRoot:
					typ, obj, err := next.Root(nil)
					if err != nil {
						t.Fatal(err)
					}
					if typ != simdjson.TypeObject {
						if typ == simdjson.TypeNone {
							close(dst)
							break parser
						}
						t.Fatal("Unexpected type:", typ.String())
					}

					o, err := obj.Object(nil)
					if err != nil {
						t.Fatal(err)
					}
					dst <- *o
				default:
					t.Fatal("unexpected type:", typ.String())
				}
			}

			refDec := json.NewReader(ioutil.NopCloser(bytes.NewBuffer(ref)), &json.ReaderArgs{ContentType: "json"})
			for {
				rec, err := dec.Read(nil)
				if err == io.EOF {
					break
				}
				if err != nil {
					// rec is nil when Read fails; continuing would panic below.
					t.Fatal(err)
				}
				want, err := refDec.Read(nil)
				if err != nil {
					// Without a reference record there is nothing to compare against.
					t.Fatal(err)
				}

				var gotB, wantB bytes.Buffer
				err = rec.WriteCSV(&gotB, ',')
				if err != nil {
					t.Error(err)
				}
				err = want.WriteCSV(&wantB, ',')
				if err != nil {
					t.Error(err)
				}

				if !bytes.Equal(gotB.Bytes(), wantB.Bytes()) {
					t.Errorf("CSV output mismatch.\nwant: %s(%x)\ngot: %s(%x)", wantB.String(), wantB.Bytes(), gotB.String(), gotB.Bytes())
				}
				gotB.Reset()
				wantB.Reset()

				err = rec.WriteJSON(&gotB)
				if err != nil {
					t.Error(err)
				}
				err = want.WriteJSON(&wantB)
				if err != nil {
					t.Error(err)
				}
				// truncate newline from 'want'
				// (guarded: Truncate panics on a negative length when nothing was written)
				if wantB.Len() > 0 {
					wantB.Truncate(wantB.Len() - 1)
				}
				if !bytes.Equal(gotB.Bytes(), wantB.Bytes()) {
					t.Errorf("JSON output mismatch.\nwant: %s\ngot: %s", wantB.String(), gotB.String())
				}
			}
		})
	}
}

View File

@@ -0,0 +1,228 @@
/*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package simdj
import (
"encoding/csv"
"fmt"
"io"
"github.com/bcicen/jstream"
"github.com/minio/minio/pkg/s3select/json"
"github.com/minio/minio/pkg/s3select/sql"
"github.com/minio/simdjson-go"
)
// Record - is JSON record.
// It wraps a single simdjson object; the methods on Record read their data
// from that object.
type Record struct {
// object is the simdjson object holding the record's fields.
object simdjson.Object
}
// Get - gets the value for a column name.
// A nil value together with a nil error is returned when the record has no
// key called name.
func (r *Record) Get(name string) (*sql.Value, error) {
	if elem := r.object.FindKey(name, nil); elem != nil {
		return iterToValue(elem.Iter)
	}
	return nil, nil
}
// iterToValue converts the JSON element at iter to an SQL value.
// Strings, floats, integers, booleans and null map to the matching SQL type.
// Unsigned integers that cannot be read as int are converted to float.
// Objects and arrays are returned as their marshalled JSON bytes.
// Any other type results in an error.
func iterToValue(iter simdjson.Iter) (*sql.Value, error) {
	switch iter.Type() {
	case simdjson.TypeString:
		s, err := iter.String()
		if err != nil {
			return nil, err
		}
		return sql.FromString(s), nil

	case simdjson.TypeFloat:
		f, err := iter.Float()
		if err != nil {
			return nil, err
		}
		return sql.FromFloat(f), nil

	case simdjson.TypeInt:
		n, err := iter.Int()
		if err != nil {
			return nil, err
		}
		return sql.FromInt(n), nil

	case simdjson.TypeUint:
		if n, err := iter.Int(); err == nil {
			return sql.FromInt(n), nil
		}
		// Can't fit into int, convert to float.
		f, err := iter.Float()
		return sql.FromFloat(f), err

	case simdjson.TypeBool:
		b, err := iter.Bool()
		if err != nil {
			return nil, err
		}
		return sql.FromBool(b), nil

	case simdjson.TypeNull:
		return sql.FromNull(), nil

	case simdjson.TypeObject, simdjson.TypeArray:
		raw, err := iter.MarshalJSON()
		return sql.FromBytes(raw), err
	}
	return nil, fmt.Errorf("iterToValue: unknown JSON type: %s", iter.Type().String())
}
// Reset the record by replacing its object with the zero value.
func (r *Record) Reset() {
	var empty simdjson.Object
	r.object = empty
}
// Clone the record and if possible use the destination provided.
// dst is reused when it is a *Record, otherwise a new Record is allocated.
// Only the object value is copied.
func (r *Record) Clone(dst sql.Record) sql.Record {
	if reused, ok := dst.(*Record); ok {
		reused.object = r.object
		return reused
	}
	return &Record{object: r.object}
}
// CloneTo clones the record to a json Record.
// Values are only unmarshaled on object level.
// When dst is nil a new json.Record with the JSON select format is created;
// otherwise dst is reset and refilled.
// An error is returned if the object cannot be parsed or if an element can be
// converted neither by sql.IterToValue nor by Iter.Interface.
func (r *Record) CloneTo(dst *json.Record) (sql.Record, error) {
	if dst == nil {
		dst = &json.Record{SelectFormat: sql.SelectFmtJSON}
	}
	dst.Reset()
	elems, err := r.object.Parse(nil)
	if err != nil {
		return nil, err
	}
	if cap(dst.KVS) < len(elems.Elements) {
		dst.KVS = make(jstream.KVS, 0, len(elems.Elements))
	}
	for _, elem := range elems.Elements {
		v, err := sql.IterToValue(elem.Iter)
		if err != nil {
			// Fall back to the generic representation of the element.
			v, err = elem.Iter.Interface()
			if err != nil {
				// Report the failure to the caller instead of panicking;
				// this function already has an error return.
				return nil, err
			}
		}
		dst.KVS = append(dst.KVS, jstream.KV{
			Key:   elem.Name,
			Value: v,
		})
	}
	return dst, nil
}
// Set - sets the value for a column name.
// The record is first cloned with CloneTo; the value is then set on that
// clone, whose Set result is returned.
func (r *Record) Set(name string, value *sql.Value) (sql.Record, error) {
	cloned, err := r.CloneTo(nil)
	if err == nil {
		return cloned.Set(name, value)
	}
	return nil, err
}
// WriteCSV - encodes to CSV data.
// Every element of the object becomes one column: scalar values are
// converted to strings, objects and arrays are written as marshalled JSON.
// Columns are separated by fieldDelimiter.
func (r *Record) WriteCSV(writer io.Writer, fieldDelimiter rune) error {
	var (
		elem    simdjson.Iter
		columns = make([]string, 0, 10)
		// Iterate over a copy of the object.
		obj = r.object
	)

	for {
		_, typ, err := obj.NextElement(&elem)
		if err != nil {
			return err
		}
		if typ == simdjson.TypeNone {
			// All elements have been visited.
			break
		}

		var column string
		switch typ {
		case simdjson.TypeNull, simdjson.TypeFloat, simdjson.TypeUint, simdjson.TypeInt, simdjson.TypeBool, simdjson.TypeString:
			column, err = elem.StringCvt()
			if err != nil {
				return err
			}
		case simdjson.TypeObject, simdjson.TypeArray:
			raw, err := elem.MarshalJSON()
			if err != nil {
				return err
			}
			column = string(raw)
		default:
			return fmt.Errorf("cannot marshal unhandled type: %s", typ.String())
		}
		columns = append(columns, column)
	}

	w := csv.NewWriter(writer)
	w.Comma = fieldDelimiter
	if err := w.Write(columns); err != nil {
		return err
	}
	w.Flush()
	return w.Error()
}
// Raw - returns the underlying representation: the SIMDJSON format marker
// together with the record's simdjson.Object value.
func (r *Record) Raw() (sql.SelectObjectFormat, interface{}) {
	var raw interface{} = r.object
	return sql.SelectFmtSIMDJSON, raw
}
// WriteJSON - encodes to JSON data.
// The object is parsed, marshalled and written to writer with a single Write
// call. io.ErrShortWrite is returned if fewer bytes than expected are written
// without an error.
func (r *Record) WriteJSON(writer io.Writer) error {
	obj := r.object
	parsed, err := obj.Parse(nil)
	if err != nil {
		return err
	}

	out, err := parsed.MarshalJSON()
	if err != nil {
		return err
	}

	switch written, err := writer.Write(out); {
	case err != nil:
		return err
	case written != len(out):
		return io.ErrShortWrite
	}
	return nil
}
// Replace the underlying buffer of json data.
// k must be a simdjson.Object; any other type is rejected with an error and
// the record is left unchanged.
func (r *Record) Replace(k interface{}) error {
	obj, isObject := k.(simdjson.Object)
	if isObject {
		r.object = obj
		return nil
	}
	return fmt.Errorf("cannot replace internal data in simd json record with type %T", k)
}
// NewRecord - creates new JSON record wrapping obj.
// The format argument f is not used.
func NewRecord(f sql.SelectObjectFormat, obj simdjson.Object) *Record {
	rec := new(Record)
	rec.object = obj
	return rec
}

Binary file not shown.