Harshavardhana 85e939636f Fix JSON parser handling for certain objects (#7162)
This PR also adds some comments and simplifies
the code. Primary handling is done to ensure
that we make sure to honor cached buffer.

Added unit tests as well

Fixes #7141
2019-02-07 08:04:42 +05:30

555 lines
11 KiB
Go

package jstream
import (
"bytes"
"encoding/json"
"io"
"strconv"
"sync/atomic"
"unicode/utf16"
)
// ValueType - defines the type of each JSON value
type ValueType int
// Different types of JSON value
const (
Unknown ValueType = iota
Null
String
Number
Boolean
Array
Object
)
// MetaValue wraps a decoded interface value with the document
// position and depth at which the value was parsed
type MetaValue struct {
Offset int
Length int
Depth int
Value interface{}
ValueType ValueType
}
// KV contains a key and value pair parsed from a decoded object
type KV struct {
Key string `json:"key"`
Value interface{} `json:"value"`
}
// KVS - represents key values in an JSON object
type KVS []KV
// MarshalJSON - implements converting slice of KVS into a JSON object
// with multiple keys and values.
func (kvs KVS) MarshalJSON() ([]byte, error) {
b := new(bytes.Buffer)
b.Write([]byte("{"))
for i, kv := range kvs {
b.Write([]byte("\"" + kv.Key + "\"" + ":"))
valBuf, err := json.Marshal(kv.Value)
if err != nil {
return nil, err
}
b.Write(valBuf)
if i < len(kvs)-1 {
b.Write([]byte(","))
}
}
b.Write([]byte("}"))
return b.Bytes(), nil
}
// Decoder wraps an io.Reader to provide incremental decoding of
// JSON values
type Decoder struct {
*scanner
emitDepth int
emitKV bool
emitRecursive bool
depth int
scratch *scratch
metaCh chan *MetaValue
err error
// follow line position to add context to errors
lineNo int
lineStart int64
}
// NewDecoder creates new Decoder to read JSON values at the provided
// emitDepth from the provider io.Reader.
// If emitDepth is < 0, values at every depth will be emitted.
func NewDecoder(r io.Reader, emitDepth int) *Decoder {
d := &Decoder{
scanner: newScanner(r),
emitDepth: emitDepth,
scratch: &scratch{data: make([]byte, 1024)},
metaCh: make(chan *MetaValue, 128),
}
if emitDepth < 0 {
d.emitDepth = 0
d.emitRecursive = true
}
return d
}
// EmitKV enables emitting a jstream.KV struct when the items(s) parsed
// at configured emit depth are within a JSON object. By default, only
// the object values are emitted.
func (d *Decoder) EmitKV() *Decoder {
d.emitKV = true
return d
}
// Recursive enables emitting all values at a depth higher than the
// configured emit depth; e.g. if an array is found at emit depth, all
// values within the array are emitted to the stream, then the array
// containing those values is emitted.
func (d *Decoder) Recursive() *Decoder {
d.emitRecursive = true
return d
}
// Stream begins decoding from the underlying reader and returns a
// streaming MetaValue channel for JSON values at the configured emitDepth.
func (d *Decoder) Stream() chan *MetaValue {
go d.decode()
return d.metaCh
}
// Pos returns the number of bytes consumed from the underlying reader
func (d *Decoder) Pos() int { return int(d.pos) }
// Err returns the most recent decoder error if any, or nil
func (d *Decoder) Err() error { return d.err }
// Decode parses the JSON-encoded data and returns an interface value
func (d *Decoder) decode() {
defer close(d.metaCh)
d.skipSpaces()
for d.pos < atomic.LoadInt64(&d.end) {
_, err := d.emitAny()
if err != nil {
d.err = err
break
}
d.skipSpaces()
}
}
func (d *Decoder) emitAny() (interface{}, error) {
if d.pos >= atomic.LoadInt64(&d.end) {
return nil, d.mkError(ErrUnexpectedEOF)
}
offset := d.pos - 1
i, t, err := d.any()
if d.willEmit() {
d.metaCh <- &MetaValue{
Offset: int(offset),
Length: int(d.pos - offset),
Depth: d.depth,
Value: i,
ValueType: t,
}
}
return i, err
}
// return whether, at the current depth, the value being decoded will
// be emitted to stream
func (d *Decoder) willEmit() bool {
if d.emitRecursive {
return d.depth >= d.emitDepth
}
return d.depth == d.emitDepth
}
// any used to decode any valid JSON value, and returns an
// interface{} that holds the actual data
func (d *Decoder) any() (interface{}, ValueType, error) {
c := d.cur()
switch c {
case '"':
i, err := d.string()
return i, String, err
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
i, err := d.number()
return i, Number, err
case '-':
if c = d.next(); c < '0' && c > '9' {
return nil, Unknown, d.mkError(ErrSyntax, "in negative numeric literal")
}
n, err := d.number()
if err != nil {
return nil, Unknown, err
}
return -n, Number, nil
case 'f':
if d.remaining() < 4 {
return nil, Unknown, d.mkError(ErrUnexpectedEOF)
}
if d.next() == 'a' && d.next() == 'l' && d.next() == 's' && d.next() == 'e' {
return false, Boolean, nil
}
return nil, Unknown, d.mkError(ErrSyntax, "in literal false")
case 't':
if d.remaining() < 3 {
return nil, Unknown, d.mkError(ErrUnexpectedEOF)
}
if d.next() == 'r' && d.next() == 'u' && d.next() == 'e' {
return true, Boolean, nil
}
return nil, Unknown, d.mkError(ErrSyntax, "in literal true")
case 'n':
if d.remaining() < 3 {
return nil, Unknown, d.mkError(ErrUnexpectedEOF)
}
if d.next() == 'u' && d.next() == 'l' && d.next() == 'l' {
return nil, Null, nil
}
return nil, Unknown, d.mkError(ErrSyntax, "in literal null")
case '[':
i, err := d.array()
return i, Array, err
case '{':
i, err := d.object()
return i, Object, err
default:
return nil, Unknown, d.mkError(ErrSyntax, "looking for beginning of value")
}
}
// string called by `any` or `object`(for map keys) after reading `"`
func (d *Decoder) string() (string, error) {
d.scratch.reset()
var (
c = d.next()
)
scan:
for {
switch {
case c == '"':
return string(d.scratch.bytes()), nil
case c == '\\':
c = d.next()
goto scan_esc
case c < 0x20:
return "", d.mkError(ErrSyntax, "in string literal")
// Coerce to well-formed UTF-8.
default:
d.scratch.add(c)
if d.remaining() == 0 {
return "", d.mkError(ErrSyntax, "in string literal")
}
c = d.next()
}
}
scan_esc:
switch c {
case '"', '\\', '/', '\'':
d.scratch.add(c)
case 'u':
goto scan_u
case 'b':
d.scratch.add('\b')
case 'f':
d.scratch.add('\f')
case 'n':
d.scratch.add('\n')
case 'r':
d.scratch.add('\r')
case 't':
d.scratch.add('\t')
default:
return "", d.mkError(ErrSyntax, "in string escape code")
}
c = d.next()
goto scan
scan_u:
r := d.u4()
if r < 0 {
return "", d.mkError(ErrSyntax, "in unicode escape sequence")
}
// check for proceeding surrogate pair
c = d.next()
if !utf16.IsSurrogate(r) || c != '\\' {
d.scratch.addRune(r)
goto scan
}
if c = d.next(); c != 'u' {
d.scratch.addRune(r)
goto scan_esc
}
r2 := d.u4()
if r2 < 0 {
return "", d.mkError(ErrSyntax, "in unicode escape sequence")
}
// write surrogate pair
d.scratch.addRune(utf16.DecodeRune(r, r2))
c = d.next()
goto scan
}
// u4 reads four bytes following a \u escape
func (d *Decoder) u4() rune {
// logic taken from:
// github.com/buger/jsonparser/blob/master/escape.go#L20
var h [4]int
for i := 0; i < 4; i++ {
c := d.next()
switch {
case c >= '0' && c <= '9':
h[i] = int(c - '0')
case c >= 'A' && c <= 'F':
h[i] = int(c - 'A' + 10)
case c >= 'a' && c <= 'f':
h[i] = int(c - 'a' + 10)
default:
return -1
}
}
return rune(h[0]<<12 + h[1]<<8 + h[2]<<4 + h[3])
}
// number called by `any` after reading number between 0 to 9
func (d *Decoder) number() (float64, error) {
d.scratch.reset()
var (
c = d.cur()
n float64
isFloat bool
)
// digits first
switch {
case c == '0':
d.scratch.add(c)
c = d.next()
case '1' <= c && c <= '9':
for ; c >= '0' && c <= '9'; c = d.next() {
n = 10*n + float64(c-'0')
d.scratch.add(c)
}
}
// . followed by 1 or more digits
if c == '.' {
isFloat = true
d.scratch.add(c)
// first char following must be digit
if c = d.next(); c < '0' && c > '9' {
return 0, d.mkError(ErrSyntax, "after decimal point in numeric literal")
}
d.scratch.add(c)
for {
if d.remaining() == 0 {
return 0, d.mkError(ErrUnexpectedEOF)
}
if c = d.next(); c < '0' || c > '9' {
break
}
d.scratch.add(c)
}
}
// e or E followed by an optional - or + and
// 1 or more digits.
if c == 'e' || c == 'E' {
isFloat = true
d.scratch.add(c)
if c = d.next(); c == '+' || c == '-' {
d.scratch.add(c)
if c = d.next(); c < '0' || c > '9' {
return 0, d.mkError(ErrSyntax, "in exponent of numeric literal")
}
d.scratch.add(c)
}
for ; c >= '0' && c <= '9'; c = d.next() {
d.scratch.add(c)
}
}
if isFloat {
var (
err error
sn string
)
sn = string(d.scratch.bytes())
if n, err = strconv.ParseFloat(sn, 64); err != nil {
return 0, err
}
}
d.back()
return n, nil
}
// array accept valid JSON array value
func (d *Decoder) array() ([]interface{}, error) {
d.depth++
var (
c byte
v interface{}
err error
array = make([]interface{}, 0)
)
// look ahead for ] - if the array is empty.
if c = d.skipSpaces(); c == ']' {
goto out
}
scan:
if v, err = d.emitAny(); err != nil {
goto out
}
if d.depth > d.emitDepth { // skip alloc for array if it won't be emitted
array = append(array, v)
}
// next token must be ',' or ']'
switch c = d.skipSpaces(); c {
case ',':
d.skipSpaces()
goto scan
case ']':
goto out
default:
err = d.mkError(ErrSyntax, "after array element")
}
out:
d.depth--
return array, err
}
// object accept valid JSON array value
func (d *Decoder) object() (KVS, error) {
d.depth++
var (
c byte
k string
v interface{}
t ValueType
err error
obj KVS
)
// skip allocating map if it will not be emitted
if d.depth > d.emitDepth {
obj = make(KVS, 0)
}
// if the object has no keys
if c = d.skipSpaces(); c == '}' {
goto out
}
scan:
for {
offset := d.pos - 1
// read string key
if c != '"' {
err = d.mkError(ErrSyntax, "looking for beginning of object key string")
break
}
if k, err = d.string(); err != nil {
break
}
// read colon before value
if c = d.skipSpaces(); c != ':' {
err = d.mkError(ErrSyntax, "after object key")
break
}
// read value
d.skipSpaces()
if d.emitKV {
if v, t, err = d.any(); err != nil {
break
}
if d.willEmit() {
d.metaCh <- &MetaValue{
Offset: int(offset),
Length: int(d.pos - offset),
Depth: d.depth,
Value: KV{k, v},
ValueType: t,
}
}
} else {
if v, err = d.emitAny(); err != nil {
break
}
}
if obj != nil {
obj = append(obj, KV{k, v})
}
// next token must be ',' or '}'
switch c = d.skipSpaces(); c {
case '}':
goto out
case ',':
c = d.skipSpaces()
goto scan
default:
err = d.mkError(ErrSyntax, "after object key:value pair")
}
}
out:
d.depth--
return obj, err
}
// returns the next char after white spaces
func (d *Decoder) skipSpaces() byte {
for d.pos < atomic.LoadInt64(&d.end) {
switch c := d.next(); c {
case '\n':
d.lineStart = d.pos
d.lineNo++
continue
case ' ', '\t', '\r':
continue
default:
return c
}
}
return 0
}
// create syntax errors at current position, with optional context
func (d *Decoder) mkError(err SyntaxError, context ...string) error {
if len(context) > 0 {
err.context = context[0]
}
err.atChar = d.cur()
err.pos[0] = d.lineNo + 1
err.pos[1] = int(d.pos - d.lineStart)
return err
}