Add sufficient deadlines and countermeasures to handle hung node scenario (#19688)

Signed-off-by: Shubhendu Ram Tripathi <shubhendu@minio.io>
Signed-off-by: Harshavardhana <harsha@minio.io>
This commit is contained in:
Shubhendu
2024-05-23 04:37:14 +05:30
committed by GitHub
parent ca80eced24
commit 7c7650b7c3
34 changed files with 292 additions and 133 deletions

View File

@@ -42,6 +42,7 @@ import (
xioutil "github.com/minio/minio/internal/ioutil"
"github.com/minio/minio/internal/logger"
"github.com/minio/minio/internal/pubsub"
xnet "github.com/minio/pkg/v2/net"
"github.com/puzpuzpuz/xsync/v3"
"github.com/tinylib/msgp/msgp"
"github.com/zeebo/xxh3"
@@ -677,9 +678,8 @@ func (c *Connection) connect() {
return
}
if gotState != StateConnecting {
// Don't print error on first attempt,
// and after that only once per hour.
gridLogOnceIf(c.ctx, fmt.Errorf("grid: %s connecting to %s: %w (%T) Sleeping %v (%v)", c.Local, toDial, err, err, sleep, gotState), toDial)
// Don't print error on first attempt, and after that only once per hour.
gridLogOnceIf(c.ctx, fmt.Errorf("grid: %s re-connecting to %s: %w (%T) Sleeping %v (%v)", c.Local, toDial, err, err, sleep, gotState), toDial)
}
c.updateState(StateConnectionError)
time.Sleep(sleep)
@@ -972,7 +972,9 @@ func (c *Connection) handleMessages(ctx context.Context, conn net.Conn) {
msg, err = readDataInto(msg, conn, c.side, ws.OpBinary)
if err != nil {
cancel(ErrDisconnected)
gridLogIfNot(ctx, fmt.Errorf("ws read: %w", err), net.ErrClosed, io.EOF)
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIfNot(ctx, fmt.Errorf("ws read: %w", err), net.ErrClosed, io.EOF)
}
return
}
if c.incomingBytes != nil {
@@ -983,7 +985,9 @@ func (c *Connection) handleMessages(ctx context.Context, conn net.Conn) {
var m message
subID, remain, err := m.parse(msg)
if err != nil {
gridLogIf(ctx, fmt.Errorf("ws parse package: %w", err))
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIf(ctx, fmt.Errorf("ws parse package: %w", err))
}
cancel(ErrDisconnected)
return
}
@@ -1004,7 +1008,9 @@ func (c *Connection) handleMessages(ctx context.Context, conn net.Conn) {
var next []byte
next, remain, err = msgp.ReadBytesZC(remain)
if err != nil {
gridLogIf(ctx, fmt.Errorf("ws read merged: %w", err))
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIf(ctx, fmt.Errorf("ws read merged: %w", err))
}
cancel(ErrDisconnected)
return
}
@@ -1012,7 +1018,9 @@ func (c *Connection) handleMessages(ctx context.Context, conn net.Conn) {
m.Payload = nil
subID, _, err = m.parse(next)
if err != nil {
gridLogIf(ctx, fmt.Errorf("ws parse merged: %w", err))
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIf(ctx, fmt.Errorf("ws parse merged: %w", err))
}
cancel(ErrDisconnected)
return
}
@@ -1119,18 +1127,24 @@ func (c *Connection) handleMessages(ctx context.Context, conn net.Conn) {
buf.Reset()
err := wsw.writeMessage(&buf, c.side, ws.OpBinary, toSend)
if err != nil {
gridLogIf(ctx, fmt.Errorf("ws writeMessage: %w", err))
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIf(ctx, fmt.Errorf("ws writeMessage: %w", err))
}
return
}
PutByteBuffer(toSend)
err = conn.SetWriteDeadline(time.Now().Add(connWriteTimeout))
if err != nil {
gridLogIf(ctx, fmt.Errorf("conn.SetWriteDeadline: %w", err))
return
}
_, err = buf.WriteTo(conn)
if err != nil {
gridLogIf(ctx, fmt.Errorf("ws write: %w", err))
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIf(ctx, fmt.Errorf("ws write: %w", err))
}
return
}
continue
@@ -1163,18 +1177,24 @@ func (c *Connection) handleMessages(ctx context.Context, conn net.Conn) {
buf.Reset()
err = wsw.writeMessage(&buf, c.side, ws.OpBinary, toSend)
if err != nil {
gridLogIf(ctx, fmt.Errorf("ws writeMessage: %w", err))
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIf(ctx, fmt.Errorf("ws writeMessage: %w", err))
}
return
}
// buf is our local buffer, so we can reuse it.
err = conn.SetWriteDeadline(time.Now().Add(connWriteTimeout))
if err != nil {
gridLogIf(ctx, fmt.Errorf("conn.SetWriteDeadline: %w", err))
return
}
// buf is our local buffer, so we can reuse it.
_, err = buf.WriteTo(conn)
if err != nil {
gridLogIf(ctx, fmt.Errorf("ws write: %w", err))
if !xnet.IsNetworkOrHostDown(err, true) {
gridLogIf(ctx, fmt.Errorf("ws write: %w", err))
}
return
}