Implement on-board diagnostics admin API (#9024)

- Implement a graph algorithm to test network bandwidth from every 
  node to every other node
- Saturate any network bandwidth adaptively, accounting for slow 
  and fast network capacity
- Implement parallel drive OBD tests
- Implement a paging mechanism for OBD test to provide periodic updates to client
- Implement Sys, Process, Host, Mem OBD Infos
This commit is contained in:
Sidhartha Mani
2020-03-26 21:07:39 -07:00
committed by GitHub
parent 2777956581
commit 0c80bf45d0
21 changed files with 2153 additions and 7 deletions

184
pkg/disk/obd.go Normal file
View File

@@ -0,0 +1,184 @@
/*
* MinIO Cloud Storage, (C) 2020 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package disk
import (
	"context"
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"sync"
	"time"

	"github.com/montanaflynn/stats"
)
// Binary size units expressed in bytes. GetOBDInfo uses mb to size both the
// blocks it writes and the total test file.
const (
kb = uint64(1 << 10) // 1 KiB
mb = uint64(kb << 10) // 1 MiB
gb = uint64(mb << 10) // 1 GiB
)
// globalLatency and globalThroughput record the most recent results produced
// by GetOBDInfo, keyed by the parent directory of the tested endpoint. Within
// this file they are only written; the lookup that would serve cached results
// is commented out in GetOBDInfo.
var globalLatency = map[string]Latency{}
var globalThroughput = map[string]Throughput{}
// Latency holds latency information for write operations to the drive.
// Every value is in seconds and summarizes the per-block write times
// collected by GetOBDInfo.
type Latency struct {
Avg float64 `json:"avg_secs,omitempty"` // arithmetic mean
Percentile50 float64 `json:"percentile50_secs,omitempty"` // 50th percentile
Percentile90 float64 `json:"percentile90_secs,omitempty"` // 90th percentile
Percentile99 float64 `json:"percentile99_secs,omitempty"` // 99th percentile
Min float64 `json:"min_secs,omitempty"` // fastest write
Max float64 `json:"max_secs,omitempty"` // slowest write
}
// Throughput holds throughput information for write operations to the drive.
// Every value is in bytes per second; GetOBDInfo derives each sample as the
// block size divided by the time taken to write that block.
type Throughput struct {
Avg float64 `json:"avg_bytes_per_sec,omitempty"` // arithmetic mean
Percentile50 float64 `json:"percentile50_bytes_per_sec,omitempty"` // 50th percentile
Percentile90 float64 `json:"percentile90_bytes_per_sec,omitempty"` // 90th percentile
Percentile99 float64 `json:"percentile99_bytes_per_sec,omitempty"` // 99th percentile
Min float64 `json:"min_bytes_per_sec,omitempty"` // slowest block
Max float64 `json:"max_bytes_per_sec,omitempty"` // fastest block
}
// obdStatsMu guards writes to globalLatency and globalThroughput. GetOBDInfo
// may run for several drives at the same time, and unsynchronized writes to a
// Go map from multiple goroutines are a fatal runtime error.
var obdStatsMu sync.Mutex

// obdSummary carries the six statistics reported for one series of samples.
type obdSummary struct {
	avg, p50, p90, p99, min, max float64
}

// summarizeOBDSamples computes the mean, the 50th/90th/99th percentiles, the
// maximum and the minimum of samples, returning the first error reported by
// the stats package.
func summarizeOBDSamples(samples []float64) (obdSummary, error) {
	var (
		s   obdSummary
		err error
	)
	if s.avg, err = stats.Mean(samples); err != nil {
		return obdSummary{}, err
	}
	if s.p50, err = stats.Percentile(samples, 50); err != nil {
		return obdSummary{}, err
	}
	if s.p90, err = stats.Percentile(samples, 90); err != nil {
		return obdSummary{}, err
	}
	if s.p99, err = stats.Percentile(samples, 99); err != nil {
		return obdSummary{}, err
	}
	if s.max, err = stats.Max(samples); err != nil {
		return obdSummary{}, err
	}
	if s.min, err = stats.Min(samples); err != nil {
		return obdSummary{}, err
	}
	return s, nil
}

// GetOBDInfo measures write performance of the drive that holds endpoint.
//
// It opens endpoint through OpenFileDirectIO (creating or truncating it),
// writes 256 MiB of zero bytes in 1 MiB blocks while timing every write, and
// closes and removes the file before returning. The returned Latency is in
// seconds and the returned Throughput in bytes per second; both summarize the
// per-block samples. The results are also stored in globalLatency and
// globalThroughput under the parent directory of endpoint.
//
// An error is returned when the file cannot be opened, when ctx is done
// before all blocks are written, when a write fails or is short, or when the
// statistics cannot be computed. On error both result values are zero.
func GetOBDInfo(ctx context.Context, endpoint string) (Latency, Throughput, error) {
	// Keep the goroutine on one OS thread for the duration of the
	// measurement. The unlock is deferred so that every early return undoes
	// the lock instead of leaving the goroutine wired to its thread.
	runtime.LockOSThread()
	defer runtime.UnlockOSThread()

	f, err := OpenFileDirectIO(endpoint, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0755)
	if err != nil {
		return Latency{}, Throughput{}, err
	}
	defer func() {
		// Best-effort cleanup of the scratch file.
		f.Close()
		os.Remove(f.Name())
	}()

	drive := filepath.Dir(endpoint)

	// going to leave this here incase we decide to go back to caching again
	// if gl, ok := globalLatency[drive]; ok {
	// 	if gt, ok := globalThroughput[drive]; ok {
	// 		return gl, gt, nil
	// 	}
	// }

	blockSize := 1 * mb
	fileSize := 256 * mb
	numBlocks := fileSize / blockSize

	latencies := make([]float64, numBlocks)
	throughputs := make([]float64, numBlocks)
	data := make([]byte, blockSize)

	// Write through f itself. Wrapping f.Fd() in a second *os.File would give
	// the descriptor two owners, and the extra owner's finalizer could close
	// the descriptor number after it has been reused for something else.
	for i := range latencies {
		if err := ctx.Err(); err != nil {
			return Latency{}, Throughput{}, err
		}
		startTime := time.Now()
		n, err := f.Write(data)
		if err != nil {
			return Latency{}, Throughput{}, err
		}
		if uint64(n) != blockSize {
			return Latency{}, Throughput{}, fmt.Errorf("Expected to write %d, but only wrote %d", blockSize, n)
		}
		latencies[i] = time.Since(startTime).Seconds()
	}

	for i, latency := range latencies {
		throughputs[i] = float64(blockSize) / latency
	}

	ls, err := summarizeOBDSamples(latencies)
	if err != nil {
		return Latency{}, Throughput{}, err
	}
	ts, err := summarizeOBDSamples(throughputs)
	if err != nil {
		return Latency{}, Throughput{}, err
	}

	l := Latency{
		Avg:          ls.avg,
		Percentile50: ls.p50,
		Percentile90: ls.p90,
		Percentile99: ls.p99,
		Min:          ls.min,
		Max:          ls.max,
	}
	t := Throughput{
		Avg:          ts.avg,
		Percentile50: ts.p50,
		Percentile90: ts.p90,
		Percentile99: ts.p99,
		Min:          ts.min,
		Max:          ts.max,
	}

	obdStatsMu.Lock()
	globalLatency[drive] = l
	globalThroughput[drive] = t
	obdStatsMu.Unlock()

	return l, t, nil
}

View File

@@ -51,6 +51,8 @@ const (
ServerHardwareInfoAdminAction = "admin:HardwareInfo"
// ServerInfoAdminAction - allow listing server info
ServerInfoAdminAction = "admin:ServerInfo"
// OBDInfoAdminAction - allow obtaining cluster on-board diagnostics
OBDInfoAdminAction = "admin:OBDInfo"
// ServerUpdateAdminAction - allow MinIO binary update
ServerUpdateAdminAction = "admin:ServerUpdate"

291
pkg/madmin/obd.go Normal file
View File

@@ -0,0 +1,291 @@
/*
* MinIO Cloud Storage, (C) 2020 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package madmin
import (
"context"
"encoding/json"
"io"
"net/http"
"net/url"
"time"
"github.com/minio/minio/pkg/disk"
"github.com/minio/minio/pkg/net"
"github.com/shirou/gopsutil/cpu"
"github.com/shirou/gopsutil/host"
"github.com/shirou/gopsutil/mem"
nethw "github.com/shirou/gopsutil/net"
"github.com/shirou/gopsutil/process"
)
// OBDInfo - MinIO cluster's OBD (on-board diagnostics) info. It is the
// message type delivered on the channel returned by
// AdminClient.ServerOBDInfo.
type OBDInfo struct {
TimeStamp time.Time `json:"timestamp,omitempty"` // set by ServerOBDInfo on its final message
Error string `json:"error,omitempty"` // non-empty when the request or decoding failed
Perf PerfOBDInfo `json:"perf,omitempty"` // drive and network performance results
Minio MinioOBDInfo `json:"minio,omitempty"` // MinIO server info and configuration
Sys SysOBDInfo `json:"sys,omitempty"` // hardware and operating system details
}
// SysOBDInfo - Includes hardware and system information of the MinIO cluster.
// Each slice element carries the Addr of the node it describes.
type SysOBDInfo struct {
CPUInfo []ServerCPUOBDInfo `json:"cpus,omitempty"`
DiskHwInfo []ServerDiskHwOBDInfo `json:"disks,omitempty"`
OsInfo []ServerOsOBDInfo `json:"osinfos,omitempty"`
MemInfo []ServerMemOBDInfo `json:"meminfos,omitempty"`
ProcInfo []ServerProcOBDInfo `json:"procinfos,omitempty"`
Error string `json:"error,omitempty"`
}
// ServerProcOBDInfo - Includes process level information for a single host,
// identified by Addr.
type ServerProcOBDInfo struct {
Addr string `json:"addr"`
Processes []SysOBDProcess `json:"processes,omitempty"`
Error string `json:"error,omitempty"`
}
// SysOBDProcess - Includes process level information about a single process.
//
// Apart from Pid every field is optional in the JSON encoding. Uids is
// serialized as "uids" and omitted when empty; its tag previously lacked the
// comma before omitempty, which made the whole string "uidsomitempty" the
// JSON key and disabled omission.
type SysOBDProcess struct {
	Pid            int32                       `json:"pid"`
	Background     bool                        `json:"background,omitempty"`
	CPUPercent     float64                     `json:"cpupercent,omitempty"`
	Children       []int32                     `json:"children,omitempty"`
	CmdLine        string                      `json:"cmd,omitempty"`
	Connections    []nethw.ConnectionStat      `json:"connections,omitempty"`
	CreateTime     int64                       `json:"createtime,omitempty"`
	Cwd            string                      `json:"cwd,omitempty"`
	Exe            string                      `json:"exe,omitempty"`
	Gids           []int32                     `json:"gids,omitempty"`
	IOCounters     *process.IOCountersStat     `json:"iocounters,omitempty"`
	IsRunning      bool                        `json:"isrunning,omitempty"`
	MemInfo        *process.MemoryInfoStat     `json:"meminfo,omitempty"`
	MemMaps        *[]process.MemoryMapsStat   `json:"memmaps,omitempty"`
	MemPercent     float32                     `json:"mempercent,omitempty"`
	Name           string                      `json:"name,omitempty"`
	NetIOCounters  []nethw.IOCountersStat      `json:"netiocounters,omitempty"`
	Nice           int32                       `json:"nice,omitempty"`
	NumCtxSwitches *process.NumCtxSwitchesStat `json:"numctxswitches,omitempty"`
	NumFds         int32                       `json:"numfds,omitempty"`
	NumThreads     int32                       `json:"numthreads,omitempty"`
	OpenFiles      []process.OpenFilesStat     `json:"openfiles,omitempty"`
	PageFaults     *process.PageFaultsStat     `json:"pagefaults,omitempty"`
	Parent         int32                       `json:"parent,omitempty"`
	Ppid           int32                       `json:"ppid,omitempty"`
	Rlimit         []process.RlimitStat        `json:"rlimit,omitempty"`
	Status         string                      `json:"status,omitempty"`
	Tgid           int32                       `json:"tgid,omitempty"`
	Threads        map[int32]*cpu.TimesStat    `json:"threadstats,omitempty"`
	Times          *cpu.TimesStat              `json:"cputimes,omitempty"`
	Uids           []int32                     `json:"uids,omitempty"`
	Username       string                      `json:"username,omitempty"`
}
// ServerMemOBDInfo - Includes virtual and swap memory information for a
// single host, identified by Addr.
type ServerMemOBDInfo struct {
Addr string `json:"addr"`
SwapMem *mem.SwapMemoryStat `json:"swap,omitempty"`
VirtualMem *mem.VirtualMemoryStat `json:"virtualmem,omitempty"`
Error string `json:"error,omitempty"`
}
// ServerOsOBDInfo - Includes operating system information for a single host,
// identified by Addr: general host info, temperature sensor readings and
// user sessions.
type ServerOsOBDInfo struct {
Addr string `json:"addr"`
Info *host.InfoStat `json:"info,omitempty"`
Sensors []host.TemperatureStat `json:"sensors,omitempty"`
Users []host.UserStat `json:"users,omitempty"`
Error string `json:"error,omitempty"`
}
// ServerCPUOBDInfo - Includes cpu and timer stats of each node of the MinIO
// cluster. One value describes the node identified by Addr.
type ServerCPUOBDInfo struct {
Addr string `json:"addr"`
CPUStat []cpu.InfoStat `json:"cpu,omitempty"`
TimeStat []cpu.TimesStat `json:"time,omitempty"`
Error string `json:"error,omitempty"`
}
// MinioOBDInfo - Includes MinIO server information and configuration.
// ServerOBDInfo fills Info on the client side from AdminClient.ServerInfo
// when the "minioinfo" data type is requested.
type MinioOBDInfo struct {
Info InfoMessage `json:"info,omitempty"`
Config interface{} `json:"config,omitempty"` // untyped; shape is whatever the server encodes
Error string `json:"error,omitempty"`
}
// PerfOBDInfo - Includes Drive and Net perf info for the entire MinIO
// cluster, one element per node in each slice.
type PerfOBDInfo struct {
DriveInfo []ServerDrivesOBDInfo `json:"drives,omitempty"`
Net []ServerNetOBDInfo `json:"net,omitempty"`
Error string `json:"error,omitempty"`
}
// ServerDrivesOBDInfo - Drive OBD info about all drives in a single MinIO
// node, identified by Addr.
// NOTE(review): Serial and Parallel presumably hold results from testing the
// drives one at a time and concurrently, respectively - confirm against the
// server implementation.
type ServerDrivesOBDInfo struct {
Addr string `json:"addr"`
Serial []DriveOBDInfo `json:"serial,omitempty"`
Parallel []DriveOBDInfo `json:"parallel,omitempty"`
Error string `json:"error,omitempty"`
}
// DriveOBDInfo - Stats about a single drive in a MinIO node. Path is
// serialized under the JSON key "endpoint".
type DriveOBDInfo struct {
Path string `json:"endpoint"`
Latency disk.Latency `json:"latency,omitempty"`
Throughput disk.Throughput `json:"throughput,omitempty"`
Error string `json:"error,omitempty"`
}
// ServerNetOBDInfo - Network OBD info about a single MinIO node, identified
// by Addr. Net holds one entry per remote node measured from it.
type ServerNetOBDInfo struct {
Addr string `json:"addr"`
Net []NetOBDInfo `json:"net,omitempty"`
Error string `json:"error,omitempty"`
}
// NetOBDInfo - one-to-one network connectivity stats between 2 MinIO nodes.
// Addr is the remote side of the pair and is serialized under the JSON key
// "remote".
type NetOBDInfo struct {
Addr string `json:"remote"`
Latency net.Latency `json:"latency,omitempty"`
Throughput net.Throughput `json:"throughput,omitempty"`
Error string `json:"error,omitempty"`
}
// OBDDataType - Typed OBD data types
type OBDDataType string
// OBDDataTypes - the known OBD data types. The string value of each constant
// is also the query parameter name ServerOBDInfo sends to the server.
const (
OBDDataTypePerfDrive OBDDataType = "perfdrive"
OBDDataTypePerfNet OBDDataType = "perfnet"
OBDDataTypeMinioInfo OBDDataType = "minioinfo"
OBDDataTypeMinioConfig OBDDataType = "minioconfig"
OBDDataTypeSysCPU OBDDataType = "syscpu"
OBDDataTypeSysDiskHw OBDDataType = "sysdiskhw"
OBDDataTypeSysDocker OBDDataType = "sysdocker" // is this really needed?
OBDDataTypeSysOsInfo OBDDataType = "sysosinfo"
OBDDataTypeSysLoad OBDDataType = "sysload" // provides very little info. Making it TBD
OBDDataTypeSysMem OBDDataType = "sysmem"
OBDDataTypeSysNet OBDDataType = "sysnet"
OBDDataTypeSysProcess OBDDataType = "sysprocess"
)
// OBDDataTypesMap - Map of OBD datatypes, keyed by the string form of each
// OBDDataType constant. It has the same members as OBDDataTypesList; keep
// the two in sync when adding a type.
var OBDDataTypesMap = map[string]OBDDataType{
"perfdrive": OBDDataTypePerfDrive,
"perfnet": OBDDataTypePerfNet,
"minioinfo": OBDDataTypeMinioInfo,
"minioconfig": OBDDataTypeMinioConfig,
"syscpu": OBDDataTypeSysCPU,
"sysdiskhw": OBDDataTypeSysDiskHw,
"sysdocker": OBDDataTypeSysDocker,
"sysosinfo": OBDDataTypeSysOsInfo,
"sysload": OBDDataTypeSysLoad,
"sysmem": OBDDataTypeSysMem,
"sysnet": OBDDataTypeSysNet,
"sysprocess": OBDDataTypeSysProcess,
}
// OBDDataTypesList - List of OBD datatypes. ServerOBDInfo iterates it to set
// every data type's query parameter to "false" before enabling the requested
// ones.
var OBDDataTypesList = []OBDDataType{
OBDDataTypePerfDrive,
OBDDataTypePerfNet,
OBDDataTypeMinioInfo,
OBDDataTypeMinioConfig,
OBDDataTypeSysCPU,
OBDDataTypeSysDiskHw,
OBDDataTypeSysDocker,
OBDDataTypeSysOsInfo,
OBDDataTypeSysLoad,
OBDDataTypeSysMem,
OBDDataTypeSysNet,
OBDDataTypeSysProcess,
}
// ServerOBDInfo - Connect to a minio server and call OBD Info Management API
// to fetch server's information represented by OBDInfo structure.
//
// obdDataTypes selects which sections the server should collect; every type
// in OBDDataTypesList that is not listed is sent as "false".
//
// Results are streamed on the returned unbuffered channel, which is closed
// once the work finishes, whether it succeeded or failed. When "minioinfo"
// is requested, the first message carries the result of adm.ServerInfo. Each
// JSON document read from the response is decoded into the same OBDInfo
// value and sent, and a last message with TimeStamp set follows the end of
// the stream. A failure is reported as a message with only Error set and
// ends the stream. The caller must keep receiving until the channel is
// closed, because sends block until they are received.
func (adm *AdminClient) ServerOBDInfo(ctx context.Context, obdDataTypes []OBDDataType) <-chan OBDInfo {
	respChan := make(chan OBDInfo)
	go func() {
		// Close on every exit path so that a receiver ranging over the
		// channel terminates after an error as well as on success.
		defer close(respChan)

		v := url.Values{}
		// start with all set to false
		for _, d := range OBDDataTypesList {
			v.Set(string(d), "false")
		}
		// only 'trueify' user provided values
		for _, d := range obdDataTypes {
			v.Set(string(d), "true")
		}

		var obdInfo OBDInfo

		if v.Get(string(OBDDataTypeMinioInfo)) == "true" {
			info, err := adm.ServerInfo(ctx)
			if err != nil {
				respChan <- OBDInfo{Error: err.Error()}
				return
			}
			obdInfo.Minio.Info = info
			respChan <- obdInfo
		}

		resp, err := adm.executeMethod(ctx, http.MethodGet, requestData{
			relPath:     adminAPIPrefix + "/obdinfo",
			queryValues: v,
		})
		defer closeResponse(resp)
		if err != nil {
			respChan <- OBDInfo{Error: err.Error()}
			return
		}

		// Check response http status code
		if resp.StatusCode != http.StatusOK {
			respChan <- OBDInfo{Error: httpRespToErrorResponse(resp).Error()}
			return
		}

		// Unmarshal the server's json response, one document at a time.
		decoder := json.NewDecoder(resp.Body)
		for {
			err := decoder.Decode(&obdInfo)
			if err == io.EOF {
				break
			}
			if err != nil {
				// A decoder that has failed keeps returning the same
				// error, so report it once and stop instead of looping.
				respChan <- OBDInfo{Error: err.Error()}
				return
			}
			respChan <- obdInfo
		}

		obdInfo.TimeStamp = time.Now()
		respChan <- obdInfo
	}()
	return respChan
}

View File

@@ -0,0 +1,7 @@
package madmin
// ServerDiskHwOBDInfo - disk hardware OBD info for a single node. This
// variant carries only the node address and an error string; it has no
// usage, partition or counter fields.
// NOTE(review): presumably the FreeBSD counterpart of the definition built
// with the "!freebsd" constraint - confirm from the file name.
type ServerDiskHwOBDInfo struct {
Addr string `json:"addr"`
Error string `json:"error,omitempty"`
}

33
pkg/madmin/obd_other.go Normal file
View File

@@ -0,0 +1,33 @@
// +build !freebsd
/*
* MinIO Cloud Storage, (C) 2020 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package madmin
import (
diskhw "github.com/shirou/gopsutil/disk"
)
// ServerDiskHwOBDInfo - Includes usage counters, disk counters and partitions
// for a single node, identified by Addr. This definition is compiled on all
// platforms except FreeBSD.
type ServerDiskHwOBDInfo struct {
Addr string `json:"addr"`
Usage []*diskhw.UsageStat `json:"usages,omitempty"`
Partitions []diskhw.PartitionStat `json:"partitions,omitempty"`
Counters map[string]diskhw.IOCountersStat `json:"counters,omitempty"` // NOTE(review): presumably keyed by device name - confirm
Error string `json:"error,omitempty"`
}

116
pkg/net/obd.go Normal file
View File

@@ -0,0 +1,116 @@
/*
* MinIO Cloud Storage, (C) 2020 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package net
import (
"github.com/montanaflynn/stats"
)
// Latency holds latency information for network operations between nodes.
// Every value is in seconds, as the JSON field names indicate.
type Latency struct {
Avg float64 `json:"avg_secs,omitempty"` // arithmetic mean
Percentile50 float64 `json:"percentile50_secs,omitempty"` // 50th percentile
Percentile90 float64 `json:"percentile90_secs,omitempty"` // 90th percentile
Percentile99 float64 `json:"percentile99_secs,omitempty"` // 99th percentile
Min float64 `json:"min_secs,omitempty"` // smallest sample
Max float64 `json:"max_secs,omitempty"` // largest sample
}
// Throughput holds throughput information for network operations between
// nodes. Every value is in bytes per second, as the JSON field names
// indicate.
type Throughput struct {
Avg float64 `json:"avg_bytes_per_sec,omitempty"` // arithmetic mean
Percentile50 float64 `json:"percentile50_bytes_per_sec,omitempty"` // 50th percentile
Percentile90 float64 `json:"percentile90_bytes_per_sec,omitempty"` // 90th percentile
Percentile99 float64 `json:"percentile99_bytes_per_sec,omitempty"` // 99th percentile
Min float64 `json:"min_bytes_per_sec,omitempty"` // smallest sample
Max float64 `json:"max_bytes_per_sec,omitempty"` // largest sample
}
// ComputeOBDStats reduces a series of latency samples and a series of
// throughput samples to their summary statistics: mean, 50th, 90th and 99th
// percentile, maximum and minimum.
//
// The latency series is processed first. The first error reported by the
// stats package is returned together with zero-valued results.
func ComputeOBDStats(latencies, throughputs []float64) (Latency, Throughput, error) {
	// summary carries the six statistics computed for one series.
	type summary struct {
		mean, p50, p90, p99, highest, lowest float64
	}

	// summarize evaluates the statistics one after another and stops at the
	// first failure.
	summarize := func(samples []float64) (summary, error) {
		var (
			out summary
			err error
		)
		if out.mean, err = stats.Mean(samples); err != nil {
			return summary{}, err
		}
		if out.p50, err = stats.Percentile(samples, 50); err != nil {
			return summary{}, err
		}
		if out.p90, err = stats.Percentile(samples, 90); err != nil {
			return summary{}, err
		}
		if out.p99, err = stats.Percentile(samples, 99); err != nil {
			return summary{}, err
		}
		if out.highest, err = stats.Max(samples); err != nil {
			return summary{}, err
		}
		if out.lowest, err = stats.Min(samples); err != nil {
			return summary{}, err
		}
		return out, nil
	}

	latencyStats, err := summarize(latencies)
	if err != nil {
		return Latency{}, Throughput{}, err
	}

	throughputStats, err := summarize(throughputs)
	if err != nil {
		return Latency{}, Throughput{}, err
	}

	return Latency{
			Avg:          latencyStats.mean,
			Percentile50: latencyStats.p50,
			Percentile90: latencyStats.p90,
			Percentile99: latencyStats.p99,
			Min:          latencyStats.lowest,
			Max:          latencyStats.highest,
		}, Throughput{
			Avg:          throughputStats.mean,
			Percentile50: throughputStats.p50,
			Percentile90: throughputStats.p90,
			Percentile99: throughputStats.p99,
			Min:          throughputStats.lowest,
			Max:          throughputStats.highest,
		}, nil
}