Use hdfs.Readdir() to optimize HDFS directory listings (#10121)
Currently, listing directories on HDFS incurs a per-entry remote `Stat()` call penalty, the cost of which can really blow up on directories with many entries (1,000+), especially when combined with peripheral calls (such as validation) and the fact that minio is an intermediary to the client (whereas the other clients listed below can query HDFS directly).

Because listing directories this way is expensive, the Golang HDFS library provides the [`Client.Open()`] function, which creates a [`FileReader`] that is able to batch multiple calls together through its [`Readdir()`] function. This is substantially more efficient for very large directories. In one case we were witnessing about 20+ seconds to list a directory with 1,500 entries. That is admittedly a large directory, but both the Java `hdfs dfs -ls` utility and the Golang HDFS library's sample `ls` utility were much faster.

Hadoop HDFS DFS (4.02s):

```
λ ~/code/minio → use-readdir » time hdfs dfs -ls /directory/with/1500/entries/
…
hdfs dfs -ls  5.81s user 0.49s system 156% cpu 4.020 total
```

Golang HDFS library (0.47s):

```
λ ~/code/hdfs → master » time ./hdfs ls -lh /directory/with/1500/entries/
…
./hdfs ls -lh  0.13s user 0.14s system 56% cpu 0.478 total
```

mc and minio **without** the optimization (16.96s):

```
λ ~/code/minio → master » time mc ls myhdfs/directory/with/1500/entries/
…
./mc ls  0.22s user 0.29s system 3% cpu 16.968 total
```

mc and minio **with** the optimization (0.40s):

```
λ ~/code/minio → use-readdir » time mc ls myhdfs/directory/with/1500/entries/
…
./mc ls  0.13s user 0.28s system 102% cpu 0.403 total
```

[`Client.Open()`]: https://godoc.org/github.com/colinmarc/hdfs#Client.Open
[`FileReader`]: https://godoc.org/github.com/colinmarc/hdfs#FileReader
[`Readdir()`]: https://godoc.org/github.com/colinmarc/hdfs#FileReader.Readdir
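As a standalone illustration (not part of the commit), here is a minimal sketch contrasting the two access patterns against the `colinmarc/hdfs` client linked above; the namenode address and directory path are placeholders:

```go
package main

import (
	"fmt"
	"log"
	"path"

	"github.com/colinmarc/hdfs"
)

func main() {
	// Placeholder namenode address; substitute your own.
	client, err := hdfs.New("namenode:8020")
	if err != nil {
		log.Fatal(err)
	}
	dir := "/directory/with/many/entries" // placeholder path

	// Slow pattern: list names, then issue one remote Stat() RPC per entry.
	// This mirrors how the gateway previously re-stat'ed entries it had
	// already listed, paying N round trips for N entries.
	entries, err := client.ReadDir(dir)
	if err != nil {
		log.Fatal(err)
	}
	for _, entry := range entries {
		fi, err := client.Stat(path.Join(dir, entry.Name()))
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println(fi.Name(), fi.Size())
	}

	// Fast pattern: Open() the directory once and fetch every entry's
	// FileInfo through batched Readdir() calls. Passing 0 returns all
	// entries in a single slice.
	dirReader, err := client.Open(dir)
	if err != nil {
		log.Fatal(err)
	}
	defer dirReader.Close()

	infos, err := dirReader.Readdir(0)
	if err != nil {
		log.Fatal(err)
	}
	for _, fi := range infos {
		fmt.Println(fi.Name(), fi.Size())
	}
}
```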
Commit 4752323e1c (parent 11593c6cc4)
```diff
@@ -384,28 +384,78 @@ func (n *hdfsObjects) listDirFactory() minio.ListDirFunc {
 // ListObjects lists all blobs in HDFS bucket filtered by prefix.
 func (n *hdfsObjects) ListObjects(ctx context.Context, bucket, prefix, marker, delimiter string, maxKeys int) (loi minio.ListObjectsInfo, err error) {
 	if _, err := n.clnt.Stat(n.hdfsPathJoin(bucket)); err != nil {
 		return loi, hdfsToObjectErr(ctx, err, bucket)
 	}
 
+	fileInfos := make(map[string]os.FileInfo)
+	directoryPath := n.hdfsPathJoin(bucket, prefix)
+
+	if err = n.populateDirectoryListing(directoryPath, fileInfos); err != nil {
+		return loi, hdfsToObjectErr(ctx, err, bucket)
+	}
+
 	getObjectInfo := func(ctx context.Context, bucket, entry string) (minio.ObjectInfo, error) {
-		fi, err := n.clnt.Stat(n.hdfsPathJoin(bucket, entry))
-		if err != nil {
-			return minio.ObjectInfo{}, hdfsToObjectErr(ctx, err, bucket, entry)
-		}
-		return minio.ObjectInfo{
+		filePath := path.Clean(n.hdfsPathJoin(bucket, entry))
+		fi, ok := fileInfos[filePath]
+
+		// If the file info is not known, this may be a recursive listing and filePath is a
+		// child of a sub-directory. In this case, obtain that sub-directory's listing.
+		if !ok {
+			parentPath := path.Dir(filePath)
+
+			if err := n.populateDirectoryListing(parentPath, fileInfos); err != nil {
+				return minio.ObjectInfo{}, hdfsToObjectErr(ctx, err, bucket)
+			}
+
+			fi, ok = fileInfos[filePath]
+
+			if !ok {
+				err = fmt.Errorf("could not get FileInfo for path '%s'", filePath)
+				return minio.ObjectInfo{}, hdfsToObjectErr(ctx, err, bucket, entry)
+			}
+		}
+
+		objectInfo := minio.ObjectInfo{
 			Bucket:  bucket,
 			Name:    entry,
 			ModTime: fi.ModTime(),
 			Size:    fi.Size(),
 			IsDir:   fi.IsDir(),
 			AccTime: fi.(*hdfs.FileInfo).AccessTime(),
-		}, nil
+		}
+
+		delete(fileInfos, filePath)
+
+		return objectInfo, nil
 	}
 
 	return minio.ListObjects(ctx, n, bucket, prefix, marker, delimiter, maxKeys, n.listPool, n.listDirFactory(), getObjectInfo, getObjectInfo)
 }
 
+// Lists a path's direct, first-level entries and populates them in the `fileInfos` cache which maps
+// a path entry to an `os.FileInfo`. It also saves the listed path's `os.FileInfo` in the cache.
+func (n *hdfsObjects) populateDirectoryListing(filePath string, fileInfos map[string]os.FileInfo) error {
+	dirReader, err := n.clnt.Open(filePath)
+
+	if err != nil {
+		return err
+	}
+
+	dirStat := dirReader.Stat()
+	key := path.Clean(filePath)
+
+	fileInfos[key] = dirStat
+	infos, err := dirReader.Readdir(0)
+
+	if err != nil {
+		return err
+	}
+
+	for _, fileInfo := range infos {
+		filePath := n.hdfsPathJoin(filePath, fileInfo.Name())
+		fileInfos[filePath] = fileInfo
+	}
+
+	return nil
+}
+
 // deleteObject deletes a file path if its empty. If it's successfully deleted,
 // it will recursively move up the tree, deleting empty parent directories
 // until it finds one with files in it. Returns nil for a non-empty directory.
```
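The gateway code above loads each directory with `Readdir(0)` and then prunes entries from the `fileInfos` cache with `delete` as they are returned, so the cache shrinks as a listing progresses. For directories too large to materialize in one slice, the same `FileReader` also supports bounded batches. The following sketch is not part of the commit; it assumes the `colinmarc/hdfs` API linked in the commit message and `os.File`-style `Readdir` semantics, where `io.EOF` signals the end of the directory:

```go
package main

import (
	"fmt"
	"io"
	"log"
	"os"

	"github.com/colinmarc/hdfs"
)

// listInBatches visits every entry of dir in fixed-size batches instead of
// loading the entire listing into a single slice as Readdir(0) does.
func listInBatches(client *hdfs.Client, dir string, batchSize int, visit func(os.FileInfo)) error {
	dirReader, err := client.Open(dir)
	if err != nil {
		return err
	}
	defer dirReader.Close()

	for {
		infos, err := dirReader.Readdir(batchSize)
		// Handle whatever was returned before inspecting the error, in case
		// a final partial batch arrives alongside io.EOF.
		for _, fi := range infos {
			visit(fi)
		}
		if err == io.EOF {
			return nil // end of directory reached
		}
		if err != nil {
			return err
		}
	}
}

func main() {
	client, err := hdfs.New("namenode:8020") // placeholder address
	if err != nil {
		log.Fatal(err)
	}
	err = listInBatches(client, "/directory/with/many/entries", 1024, func(fi os.FileInfo) {
		fmt.Println(fi.Name(), fi.Size())
	})
	if err != nil {
		log.Fatal(err)
	}
}
```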