minio/cmd/xl-v1-utils.go
Aditya Manthramurthy 8975da4e84 Add new ReadFileWithVerify storage-layer API (#4349)
This is an enhancement to the XL/distributed-XL mode. FS mode is
unaffected.

The ReadFileWithVerify storage-layer call is similar to ReadFile with
the additional functionality of performing bit-rot checking. It
accepts additional parameters for a hashing algorithm to use and the
expected hex-encoded hash string.

This patch provides a significant performance improvement because it:

1. combines the step of reading the file (during
erasure-decoding/reconstruction) with bit-rot verification;

2. limits the number of file-reads; and

3. avoids transferring the file over the network for bit-rot
verification.

ReadFile API is implemented as ReadFileWithVerify with empty hashing
arguments.

Credits to AB and Harsha for the algorithmic improvement.

Fixes #4236.
2017-05-16 14:21:52 -07:00

392 lines
12 KiB
Go

/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"errors"
"hash/crc32"
"path"
"sync"
"time"
"github.com/tidwall/gjson"
)
// reduceErrs tallies the values in errs (after passing them through
// errorsCause) and reports the value that occurs most often together with its
// occurrence count. A nil entry is counted like any other value; entries for
// which isErrIgnored reports true are left out of the tally.
//
// When several values share the highest count, nil wins if it is one of them.
// Otherwise the winner depends on Go's map iteration order. The original
// author notes this is harmless as long as callers use a quorum of at least a
// simple majority, because no tied value can then reach quorum.
func reduceErrs(errs []error, ignoredErrs []error) (maxCount int, maxErr error) {
	tally := make(map[error]int)
	for _, cause := range errorsCause(errs) {
		if !isErrIgnored(cause, ignoredErrs...) {
			tally[cause]++
		}
	}

	for candidate, seen := range tally {
		if seen > maxCount {
			// Strictly more frequent than anything seen so far.
			maxCount, maxErr = seen, candidate
			continue
		}
		if seen == maxCount && candidate == nil {
			// Tie: success (nil) takes precedence over an error value.
			maxErr = nil
		}
	}
	return maxCount, maxErr
}
// reduceQuorumErrs reduces errs with reduceErrs and checks the most frequent
// value against quorum, which may be a read or a write quorum depending on the
// caller.
//
//   - The most frequent value occurs fewer than quorum times: quorumErr is
//     returned, wrapped by traceError together with errs.
//   - The most frequent value is nil and reaches quorum: nil is returned.
//   - The most frequent value is an error and reaches quorum: that error is
//     returned, wrapped by traceError together with errs.
func reduceQuorumErrs(errs []error, ignoredErrs []error, quorum int, quorumErr error) (maxErr error) {
	count, dominant := reduceErrs(errs, ignoredErrs)
	switch {
	case count < quorum:
		// No quorum satisfied.
		return traceError(quorumErr, errs...)
	case dominant == nil:
		// Success in quorum.
		return nil
	default:
		// Errors in quorum.
		return traceError(dominant, errs...)
	}
}
// reduceReadQuorumErrs applies reduceQuorumErrs with readQuorum as the
// threshold and errXLReadQuorum as the error reported when that threshold is
// not met.
func reduceReadQuorumErrs(errs []error, ignoredErrs []error, readQuorum int) (maxErr error) {
	maxErr = reduceQuorumErrs(errs, ignoredErrs, readQuorum, errXLReadQuorum)
	return maxErr
}
// reduceWriteQuorumErrs applies reduceQuorumErrs with writeQuorum as the
// threshold and errXLWriteQuorum as the error reported when that threshold is
// not met.
func reduceWriteQuorumErrs(errs []error, ignoredErrs []error, writeQuorum int) (maxErr error) {
	maxErr = reduceQuorumErrs(errs, ignoredErrs, writeQuorum, errXLWriteQuorum)
	return maxErr
}
// diskCount reports how many entries of disks are non-nil. Unlike len(disks)
// it does not count slots that hold no disk.
func diskCount(disks []StorageAPI) int {
	present := 0
	for i := range disks {
		if disks[i] != nil {
			present++
		}
	}
	return present
}
// hashOrder - returns a consistent ordering of the integers 1..cardinality
// derived from key. The sequence is a rotation of 1..cardinality whose
// starting offset is computed from the CRC32 (IEEE) checksum of key, so the
// same key and cardinality always yield the same slice.
//
// A nil slice is returned when cardinality is zero or negative. The zero case
// must be rejected here because cardinality is used as a modulo divisor below,
// and a zero divisor would panic at run time.
//
// NOTE(review): the `| 1` on the start offset is preserved deliberately;
// changing it would change the order returned for existing keys.
func hashOrder(key string, cardinality int) []int {
	if cardinality <= 0 {
		// Nothing to order, and the modulo below requires a non-zero divisor.
		return nil
	}

	nums := make([]int, cardinality)
	keyCrc := crc32.Checksum([]byte(key), crc32.IEEETable)

	start := int(keyCrc%uint32(cardinality)) | 1
	for i := 1; i <= cardinality; i++ {
		nums[i-1] = 1 + ((start + i) % cardinality)
	}
	return nums
}
// parseXLStat extracts the "stat" section of an xl.json buffer using gjson.
// "stat.modTime" must parse as an RFC 3339 timestamp; if it does not, the
// parse error is returned together with a zero statInfo. "stat.size" is read
// as an integer.
func parseXLStat(xlMetaBuf []byte) (statInfo, error) {
	rawModTime := gjson.GetBytes(xlMetaBuf, "stat.modTime").String()
	parsedModTime, err := time.Parse(time.RFC3339, rawModTime)
	if err != nil {
		return statInfo{}, err
	}
	return statInfo{
		ModTime: parsedModTime,
		Size:    gjson.GetBytes(xlMetaBuf, "stat.size").Int(),
	}, nil
}
// parseXLVersion returns the string value of the top-level "version" field of
// an xl.json buffer.
func parseXLVersion(xlMetaBuf []byte) string {
	versionField := gjson.GetBytes(xlMetaBuf, "version")
	return versionField.String()
}
// parseXLFormat returns the string value of the top-level "format" field of
// an xl.json buffer.
func parseXLFormat(xlMetaBuf []byte) string {
	formatField := gjson.GetBytes(xlMetaBuf, "format")
	return formatField.String()
}
// parseXLRelease returns the string value of the "minio.release" field of an
// xl.json buffer.
func parseXLRelease(xlMetaBuf []byte) string {
	releaseField := gjson.GetBytes(xlMetaBuf, "minio.release")
	return releaseField.String()
}
// parseXLErasureInfo builds an erasureInfo from the "erasure" section of an
// xl.json buffer using gjson. No validation is performed here; each field
// takes whatever value gjson yields for its path.
func parseXLErasureInfo(xlMetaBuf []byte) erasureInfo {
	section := gjson.GetBytes(xlMetaBuf, "erasure")

	// Parse xlMetaV1.Erasure.Distribution.
	distEntries := section.Get("distribution").Array()
	distribution := make([]int, 0, len(distEntries))
	for _, entry := range distEntries {
		distribution = append(distribution, int(entry.Int()))
	}

	// Parse the xlMetaV1.Erasure.Checksum array.
	sumEntries := section.Get("checksum").Array()
	checkSums := make([]checkSumInfo, 0, len(sumEntries))
	for _, entry := range sumEntries {
		checkSums = append(checkSums, checkSumInfo{
			Name:      entry.Get("name").String(),
			Algorithm: HashAlgo(entry.Get("algorithm").String()),
			Hash:      entry.Get("hash").String(),
		})
	}

	return erasureInfo{
		Algorithm:    HashAlgo(section.Get("algorithm").String()),
		DataBlocks:   int(section.Get("data").Int()),
		ParityBlocks: int(section.Get("parity").Int()),
		BlockSize:    section.Get("blockSize").Int(),
		Index:        int(section.Get("index").Int()),
		Distribution: distribution,
		Checksum:     checkSums,
	}
}
// parseXLParts converts the top-level "parts" array of an xl.json buffer into
// a slice of objectPartInfo, one element per array entry and in the same
// order. The returned slice is non-nil even when there are no parts.
func parseXLParts(xlMetaBuf []byte) []objectPartInfo {
	entries := gjson.GetBytes(xlMetaBuf, "parts").Array()
	parts := make([]objectPartInfo, 0, len(entries))
	for _, entry := range entries {
		parts = append(parts, objectPartInfo{
			Number: int(entry.Get("number").Int()),
			Name:   entry.Get("name").String(),
			ETag:   entry.Get("etag").String(),
			Size:   entry.Get("size").Int(),
		})
	}
	return parts
}
// parseXLMetaMap copies the top-level "meta" object of an xl.json buffer into
// a map[string]string, taking the string form of every value. The returned
// map is always non-nil.
func parseXLMetaMap(xlMetaBuf []byte) map[string]string {
	fields := gjson.GetBytes(xlMetaBuf, "meta").Map()
	metaMap := make(map[string]string, len(fields))
	for name, value := range fields {
		metaMap[name] = value.String()
	}
	return metaMap
}
// xlMetaV1UnmarshalJSON assembles an xlMetaV1 from an xl.json buffer by
// pulling each section out with the gjson-based parse helpers. The only
// failure path is an unparsable "stat" section, in which case a zero xlMetaV1
// and the parse error are returned.
func xlMetaV1UnmarshalJSON(xlMetaBuf []byte) (xlMetaV1, error) {
	// Stat is the only section that can fail, so resolve it before building
	// the result.
	stat, err := parseXLStat(xlMetaBuf)
	if err != nil {
		return xlMetaV1{}, err
	}

	var meta xlMetaV1
	meta.Version = parseXLVersion(xlMetaBuf)
	meta.Format = parseXLFormat(xlMetaBuf)
	meta.Stat = stat
	// xlMetaV1.Erasure fields.
	meta.Erasure = parseXLErasureInfo(xlMetaBuf)
	// xlMetaV1.Parts list.
	meta.Parts = parseXLParts(xlMetaBuf)
	// xlMetaV1.Minio.Release field.
	meta.Minio.Release = parseXLRelease(xlMetaBuf)
	// xlMetaV1.Meta map.
	meta.Meta = parseXLMetaMap(xlMetaBuf)
	return meta, nil
}
// readXLMetaParts reads the whole `xl.json` of bucket/object from disk and
// returns only its parts list, extracted with gjson. A read failure is
// returned wrapped by traceError.
func readXLMetaParts(disk StorageAPI, bucket string, object string) ([]objectPartInfo, error) {
	metaPath := path.Join(object, xlMetaJSONFile)
	buf, err := disk.ReadAll(bucket, metaPath)
	if err != nil {
		return nil, traceError(err)
	}
	return parseXLParts(buf), nil
}
// readXLMetaStat reads the whole `xl.json` of bucket/object from disk and
// returns its stat section and its meta map, both extracted with gjson.
//
// Errors, all wrapped by traceError:
//   - the read error, when the file cannot be read;
//   - errCorruptedFormat, when isXLMetaValid rejects the version/format pair;
//   - the parse error, when the stat section cannot be parsed.
func readXLMetaStat(disk StorageAPI, bucket string, object string) (statInfo, map[string]string, error) {
	buf, err := disk.ReadAll(bucket, path.Join(object, xlMetaJSONFile))
	if err != nil {
		return statInfo{}, nil, traceError(err)
	}

	// Reject unrecognized versions and formats before trusting any content.
	if !isXLMetaValid(parseXLVersion(buf), parseXLFormat(buf)) {
		return statInfo{}, nil, traceError(errCorruptedFormat)
	}

	stat, err := parseXLStat(buf)
	if err != nil {
		return statInfo{}, nil, traceError(err)
	}
	return stat, parseXLMetaMap(buf), nil
}
// readXLMeta reads the whole `xl.json` of bucket/object from disk and returns
// it as an xlMetaV1 built by xlMetaV1UnmarshalJSON. Read and parse failures
// are returned wrapped by traceError, together with a zero xlMetaV1.
func readXLMeta(disk StorageAPI, bucket string, object string) (xlMeta xlMetaV1, err error) {
	buf, readErr := disk.ReadAll(bucket, path.Join(object, xlMetaJSONFile))
	if readErr != nil {
		return xlMetaV1{}, traceError(readErr)
	}

	parsed, parseErr := xlMetaV1UnmarshalJSON(buf)
	if parseErr != nil {
		return xlMetaV1{}, traceError(parseErr)
	}
	return parsed, nil
}
// readAllXLMetadata reads `xl.json` for bucket/object from every disk, one
// goroutine per non-nil disk, and waits for all of them to finish.
//
// Both returned slices have len(disks) elements and are indexed like disks.
// For a nil disk the error slot holds errDiskNotFound and the metadata slot
// is left at its zero value. For every other disk the slots hold whatever
// readXLMeta returned.
func readAllXLMetadata(disks []StorageAPI, bucket, object string) ([]xlMetaV1, []error) {
	metas := make([]xlMetaV1, len(disks))
	errs := make([]error, len(disks))

	var wg sync.WaitGroup
	for i := range disks {
		if disks[i] == nil {
			errs[i] = errDiskNotFound
			continue
		}
		wg.Add(1)
		// Each goroutine writes only to its own slot of the two slices.
		go func(slot int, d StorageAPI) {
			defer wg.Done()
			metas[slot], errs[slot] = readXLMeta(d, bucket, object)
		}(i, disks[i])
	}
	wg.Wait()

	return metas, errs
}
// shufflePartsMetadata returns a copy of partsMetadata rearranged so that the
// element at input position i lands at position distribution[i]-1 (the
// distribution values are 1-based). When distribution is nil the input slice
// itself is returned unchanged.
func shufflePartsMetadata(partsMetadata []xlMetaV1, distribution []int) (shuffledPartsMetadata []xlMetaV1) {
	if distribution == nil {
		return partsMetadata
	}
	shuffledPartsMetadata = make([]xlMetaV1, len(partsMetadata))
	for src := range partsMetadata {
		dst := distribution[src] - 1
		shuffledPartsMetadata[dst] = partsMetadata[src]
	}
	return shuffledPartsMetadata
}
// shuffleDisks - returns a copy of disks rearranged according to the erasure
// distribution: the disk at input position i lands at position
// distribution[i]-1 (the distribution values are 1-based). When distribution
// is nil the input slice itself is returned unchanged.
func shuffleDisks(disks []StorageAPI, distribution []int) (shuffledDisks []StorageAPI) {
	if distribution == nil {
		return disks
	}
	shuffledDisks = make([]StorageAPI, len(disks))
	for src := range disks {
		dst := distribution[src] - 1
		shuffledDisks[dst] = disks[src]
	}
	return shuffledDisks
}
// Errors specifically generated by the getPartSizeFromIdx function.
var (
// errPartSizeZero is returned when the supplied part size is 0.
errPartSizeZero = errors.New("Part size cannot be zero")
// errPartSizeIndex is returned when the supplied part index is below 1.
errPartSizeIndex = errors.New("Part index cannot be smaller than 1")
)
// getPartSizeFromIdx predicts the size of the part at the 1-based partIndex
// for an object of totalSize bytes split into parts of partSize bytes.
//
// Errors (wrapped by traceError): errPartSizeZero when partSize is 0, and
// errPartSizeIndex when partIndex is smaller than 1.
//
// A totalSize of -1 or 0 is returned as is. Otherwise the result is partSize
// for every part before the last one, totalSize % partSize for the last one,
// and 0 for any index past the last one.
func getPartSizeFromIdx(totalSize int64, partSize int64, partIndex int) (int64, error) {
	switch {
	case partSize == 0:
		return 0, traceError(errPartSizeZero)
	case partIndex < 1:
		return 0, traceError(errPartSizeIndex)
	case totalSize == -1 || totalSize == 0:
		return totalSize, nil
	}

	// Index of the last (possibly partial) part.
	lastPart := totalSize/partSize + 1
	idx := int64(partIndex)

	if idx < lastPart {
		return partSize, nil
	}
	if idx > lastPart {
		return 0, nil
	}
	// Size of the last part.
	return totalSize % partSize, nil
}