From e3eec89d246bdbf4d6ffddc7c65a5139c86f180b Mon Sep 17 00:00:00 2001 From: Aditya Manthramurthy Date: Tue, 9 Oct 2018 14:02:19 -0700 Subject: [PATCH] Optimize string processing in select (#6593) Reduce allocations during string concatenation and simplify some processing code. --- pkg/s3select/helpers.go | 82 ++++++++++++++--------------------------- pkg/s3select/select.go | 46 +++++++++-------------- 2 files changed, 45 insertions(+), 83 deletions(-) diff --git a/pkg/s3select/helpers.go b/pkg/s3select/helpers.go index 4d8ae7f5f..39bf29736 100644 --- a/pkg/s3select/helpers.go +++ b/pkg/s3select/helpers.go @@ -55,8 +55,8 @@ func stringInSlice(x string, list []string) bool { // This function returns the index of a string in a list func stringIndex(a string, list []string) int { - for i := range list { - if list[i] == a { + for i, v := range list { + if v == a { return i } } @@ -65,10 +65,8 @@ func stringIndex(a string, list []string) int { // Returns a true or false, whether a string can be represented as an int. func representsInt(s string) bool { - if _, err := strconv.Atoi(s); err == nil { - return true - } - return false + _, err := strconv.Atoi(s) + return err == nil } // The function below processes the where clause into an acutal boolean given a @@ -236,16 +234,16 @@ func checkValidOperator(operator string) error { } // checkStringType converts the value from the csv to the appropriate one. -func checkStringType(myTblVal string) interface{} { - myInt, isInt := strconv.Atoi(myTblVal) - myFloat, isFloat := strconv.ParseFloat(myTblVal, 64) - if isInt == nil { - return myInt - } else if isFloat == nil { - return myFloat - } else { - return myTblVal +func checkStringType(tblVal string) interface{} { + intVal, err := strconv.Atoi(tblVal) + if err == nil { + return intVal } + floatVal, err := strconv.ParseFloat(tblVal, 64) + if err == nil { + return floatVal + } + return tblVal } // stringEval is for evaluating the state of string comparison. @@ -627,31 +625,6 @@ func (reader *Input) whereClauseNameErrs(whereClause interface{}, alias string) return nil } -// qualityCheck ensures the row has enough separators. -func qualityCheck(row string, amountOfSep int, sep string) string { - for i := 0; i < amountOfSep; i++ { - row = row + sep - } - return row -} - -// writeRow helps to write the row regardless of how many entries. -func writeRow(myRow string, myEntry string, delimiter string, numOfReqCols int) string { - if myEntry == "" && len(myRow) == 0 && numOfReqCols == 1 { - return myEntry - } - if myEntry == "" && len(myRow) == 0 { - return myEntry + delimiter - } - if len(myRow) == 1 && myRow[0] == ',' { - return myRow + myEntry - } - if len(myRow) == 0 { - return myEntry - } - return myRow + delimiter + myEntry -} - // colNameErrs is a function which makes sure that the headers are requested are // present in the file otherwise it throws an error. func (reader *Input) colNameErrs(columnNames []string) error { @@ -677,24 +650,23 @@ func (reader *Input) colNameErrs(columnNames []string) error { } // aggFuncToStr converts an array of floats into a properly formatted string. -func (reader *Input) aggFuncToStr(myAggVals []float64) string { - var myRow string - var aggregateval string - if myAggVals[0] == math.Trunc(myAggVals[0]) { - myRow = strconv.FormatInt(int64(myAggVals[0]), 10) - } else { - myRow = strconv.FormatFloat(myAggVals[0], 'f', 6, 64) +func (reader *Input) aggFuncToStr(aggVals []float64) string { + // Define a number formatting function + numToStr := func(f float64) string { + if f == math.Trunc(f) { + return strconv.FormatInt(int64(f), 10) + } + return strconv.FormatFloat(f, 'f', 6, 64) } - for i := 1; i < len(myAggVals); i++ { - if myAggVals[i] == math.Trunc(myAggVals[i]) { - aggregateval = strconv.FormatInt(int64(myAggVals[i]), 10) - } else { - aggregateval = strconv.FormatFloat(myAggVals[i], 'f', 6, 64) - } - myRow = myRow + reader.options.OutputFieldDelimiter + aggregateval + // Display all whole numbers in aggVals as integers + vals := make([]string, len(aggVals)) + for i, v := range aggVals { + vals[i] = numToStr(v) } - return myRow + + // Intersperse field delimiter + return strings.Join(vals, reader.options.OutputFieldDelimiter) } // checkForDuplicates ensures we do not have an ambigious column name. diff --git a/pkg/s3select/select.go b/pkg/s3select/select.go index 5294a963e..59b85746c 100644 --- a/pkg/s3select/select.go +++ b/pkg/s3select/select.go @@ -284,11 +284,7 @@ func (reader *Input) processSelectReq(reqColNames []string, alias string, whereC // printAsterix helps to print out the entire row if an asterix is used. func (reader *Input) printAsterix(record []string) string { - myRow := record[0] - for i := 1; i < len(record); i++ { - myRow = myRow + reader.options.OutputFieldDelimiter + record[i] - } - return myRow + return strings.Join(record, reader.options.OutputFieldDelimiter) } // processColumnNames is a function which allows for cleaning of column names. @@ -304,7 +300,7 @@ func (reader *Input) processColumnNames(reqColNames []string, alias string) erro // processColNameIndex is the function which creates the row for an index based // query. func (reader *Input) processColNameIndex(record []string, reqColNames []string, columns []string) (string, error) { - myRow := "" + row := make([]string, len(reqColNames)) for i := 0; i < len(reqColNames); i++ { // COALESCE AND NULLIF do not support index based access. if reqColNames[0] == "0" { @@ -312,54 +308,48 @@ func (reader *Input) processColNameIndex(record []string, reqColNames []string, } // Subtract 1 because AWS Indexing is not 0 based, it starts at 1. mytempindex, err := strconv.Atoi(reqColNames[i]) + if err != nil { + return "", ErrMissingHeaders + } mytempindex = mytempindex - 1 if mytempindex > len(columns) { return "", ErrInvalidColumnIndex } - myRow = writeRow(myRow, record[mytempindex], reader.options.OutputFieldDelimiter, len(reqColNames)) - if err != nil { - return "", ErrMissingHeaders - } + row[i] = record[mytempindex] } - if len(myRow) > 1000000 { + rowStr := strings.Join(row, reader.options.OutputFieldDelimiter) + if len(rowStr) > 1000000 { return "", ErrOverMaxRecordSize } - if strings.Count(myRow, reader.options.OutputFieldDelimiter) != len(reqColNames)-1 { - myRow = qualityCheck(myRow, len(reqColNames)-1-strings.Count(myRow, reader.options.OutputFieldDelimiter), reader.options.OutputFieldDelimiter) - } - return myRow, nil + return rowStr, nil } // processColNameLiteral is the function which creates the row for an name based // query. func (reader *Input) processColNameLiteral(record []string, reqColNames []string, columns []string, columnsMap map[string]int, myFunc *SelectFuncs) (string, error) { - myRow := "" + row := make([]string, len(reqColNames)) for i := 0; i < len(reqColNames); i++ { // this is the case to deal with COALESCE. if reqColNames[i] == "" && isValidFunc(myFunc.index, i) { - myVal := evaluateFuncExpr(myFunc.funcExpr[i], "", record, columnsMap) - myRow = writeRow(myRow, myVal, reader.options.OutputFieldDelimiter, len(reqColNames)) + row[i] = evaluateFuncExpr(myFunc.funcExpr[i], "", record, columnsMap) continue } myTempIndex, notFound := columnsMap[trimQuotes(reqColNames[i])] if !notFound { return "", ErrMissingHeaders } - myRow = writeRow(myRow, record[myTempIndex], reader.options.OutputFieldDelimiter, len(reqColNames)) + row[i] = record[myTempIndex] } - if len(myRow) > 1000000 { + rowStr := strings.Join(row, reader.options.OutputFieldDelimiter) + if len(rowStr) > 1000000 { return "", ErrOverMaxRecordSize } - if strings.Count(myRow, reader.options.OutputFieldDelimiter) != len(reqColNames)-1 { - myRow = qualityCheck(myRow, len(reqColNames)-1-strings.Count(myRow, reader.options.OutputFieldDelimiter), reader.options.OutputFieldDelimiter) - } - return myRow, nil - + return rowStr, nil } -// aggregationFunctions is a function which performs the actual aggregation -// methods on the given row, it uses an array defined the the main parsing -// function to keep track of values. +// aggregationFunctions performs the actual aggregation methods on the +// given row, it uses an array defined for the main parsing function +// to keep track of values. func aggregationFunctions(counter int, filtrCount int, myAggVals []float64, columnsMap map[string]int, storeReqCols []string, storeFunctions []string, record []string) error { for i := 0; i < len(storeFunctions); i++ { if storeFunctions[i] == "" {