Fix slow shutdown (#2113)
* rearrange shutdown Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com> * http closed is fine Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com> * update changelog Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com> * logging while shutting Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com> --------- Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
This commit is contained in:
parent
bac7ea67f4
commit
60b94b0467
|
@ -74,6 +74,7 @@ after improving the test harness as part of adopting [#1460](https://github.com/
|
||||||
- Make sure integration tests cover postgres for all scenarios
|
- Make sure integration tests cover postgres for all scenarios
|
||||||
- CLI commands (all except `serve`) only requires minimal configuration, no more errors or warnings from unset settings [#2109](https://github.com/juanfont/headscale/pull/2109)
|
- CLI commands (all except `serve`) only requires minimal configuration, no more errors or warnings from unset settings [#2109](https://github.com/juanfont/headscale/pull/2109)
|
||||||
- CLI results are now concistently sent to stdout and errors to stderr [#2109](https://github.com/juanfont/headscale/pull/2109)
|
- CLI results are now concistently sent to stdout and errors to stderr [#2109](https://github.com/juanfont/headscale/pull/2109)
|
||||||
|
- Fix issue where shutting down headscale would hang [#2113](https://github.com/juanfont/headscale/pull/2113)
|
||||||
|
|
||||||
## 0.22.3 (2023-05-12)
|
## 0.22.3 (2023-05-12)
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,9 @@
|
||||||
package cli
|
package cli
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
|
"net/http"
|
||||||
|
|
||||||
"github.com/rs/zerolog/log"
|
"github.com/rs/zerolog/log"
|
||||||
"github.com/spf13/cobra"
|
"github.com/spf13/cobra"
|
||||||
)
|
)
|
||||||
|
@ -22,8 +25,8 @@ var serveCmd = &cobra.Command{
|
||||||
}
|
}
|
||||||
|
|
||||||
err = app.Serve()
|
err = app.Serve()
|
||||||
if err != nil {
|
if err != nil && !errors.Is(err, http.ErrServerClosed) {
|
||||||
log.Fatal().Caller().Err(err).Msg("Error starting server")
|
log.Fatal().Caller().Err(err).Msg("Headscale ran into an error and had to shut down.")
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -770,7 +770,7 @@ func (h *Headscale) Serve() error {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
trace := log.Trace().Msgf
|
info := func(msg string) { log.Info().Msg(msg) }
|
||||||
log.Info().
|
log.Info().
|
||||||
Str("signal", sig.String()).
|
Str("signal", sig.String()).
|
||||||
Msg("Received signal to stop, shutting down gracefully")
|
Msg("Received signal to stop, shutting down gracefully")
|
||||||
|
@ -778,55 +778,55 @@ func (h *Headscale) Serve() error {
|
||||||
expireNodeCancel()
|
expireNodeCancel()
|
||||||
h.ephemeralGC.Close()
|
h.ephemeralGC.Close()
|
||||||
|
|
||||||
trace("waiting for netmap stream to close")
|
|
||||||
h.pollNetMapStreamWG.Wait()
|
|
||||||
|
|
||||||
// Gracefully shut down servers
|
// Gracefully shut down servers
|
||||||
ctx, cancel := context.WithTimeout(
|
ctx, cancel := context.WithTimeout(
|
||||||
context.Background(),
|
context.Background(),
|
||||||
types.HTTPShutdownTimeout,
|
types.HTTPShutdownTimeout,
|
||||||
)
|
)
|
||||||
trace("shutting down debug http server")
|
info("shutting down debug http server")
|
||||||
if err := debugHTTPServer.Shutdown(ctx); err != nil {
|
if err := debugHTTPServer.Shutdown(ctx); err != nil {
|
||||||
log.Error().Err(err).Msg("Failed to shutdown prometheus http")
|
log.Error().Err(err).Msg("failed to shutdown prometheus http")
|
||||||
}
|
}
|
||||||
trace("shutting down main http server")
|
info("shutting down main http server")
|
||||||
if err := httpServer.Shutdown(ctx); err != nil {
|
if err := httpServer.Shutdown(ctx); err != nil {
|
||||||
log.Error().Err(err).Msg("Failed to shutdown http")
|
log.Error().Err(err).Msg("failed to shutdown http")
|
||||||
}
|
}
|
||||||
|
|
||||||
trace("shutting down grpc server (socket)")
|
info("closing node notifier")
|
||||||
|
h.nodeNotifier.Close()
|
||||||
|
|
||||||
|
info("waiting for netmap stream to close")
|
||||||
|
h.pollNetMapStreamWG.Wait()
|
||||||
|
|
||||||
|
info("shutting down grpc server (socket)")
|
||||||
grpcSocket.GracefulStop()
|
grpcSocket.GracefulStop()
|
||||||
|
|
||||||
if grpcServer != nil {
|
if grpcServer != nil {
|
||||||
trace("shutting down grpc server (external)")
|
info("shutting down grpc server (external)")
|
||||||
grpcServer.GracefulStop()
|
grpcServer.GracefulStop()
|
||||||
grpcListener.Close()
|
grpcListener.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
if tailsqlContext != nil {
|
if tailsqlContext != nil {
|
||||||
trace("shutting down tailsql")
|
info("shutting down tailsql")
|
||||||
tailsqlContext.Done()
|
tailsqlContext.Done()
|
||||||
}
|
}
|
||||||
|
|
||||||
trace("closing node notifier")
|
|
||||||
h.nodeNotifier.Close()
|
|
||||||
|
|
||||||
// Close network listeners
|
// Close network listeners
|
||||||
trace("closing network listeners")
|
info("closing network listeners")
|
||||||
debugHTTPListener.Close()
|
debugHTTPListener.Close()
|
||||||
httpListener.Close()
|
httpListener.Close()
|
||||||
grpcGatewayConn.Close()
|
grpcGatewayConn.Close()
|
||||||
|
|
||||||
// Stop listening (and unlink the socket if unix type):
|
// Stop listening (and unlink the socket if unix type):
|
||||||
trace("closing socket listener")
|
info("closing socket listener")
|
||||||
socketListener.Close()
|
socketListener.Close()
|
||||||
|
|
||||||
// Close db connections
|
// Close db connections
|
||||||
trace("closing database connection")
|
info("closing database connection")
|
||||||
err = h.db.Close()
|
err = h.db.Close()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Error().Err(err).Msg("Failed to close db")
|
log.Error().Err(err).Msg("failed to close db")
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Info().
|
log.Info().
|
||||||
|
|
|
@ -36,6 +36,7 @@ type Notifier struct {
|
||||||
connected *xsync.MapOf[types.NodeID, bool]
|
connected *xsync.MapOf[types.NodeID, bool]
|
||||||
b *batcher
|
b *batcher
|
||||||
cfg *types.Config
|
cfg *types.Config
|
||||||
|
closed bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewNotifier(cfg *types.Config) *Notifier {
|
func NewNotifier(cfg *types.Config) *Notifier {
|
||||||
|
@ -43,6 +44,7 @@ func NewNotifier(cfg *types.Config) *Notifier {
|
||||||
nodes: make(map[types.NodeID]chan<- types.StateUpdate),
|
nodes: make(map[types.NodeID]chan<- types.StateUpdate),
|
||||||
connected: xsync.NewMapOf[types.NodeID, bool](),
|
connected: xsync.NewMapOf[types.NodeID, bool](),
|
||||||
cfg: cfg,
|
cfg: cfg,
|
||||||
|
closed: false,
|
||||||
}
|
}
|
||||||
b := newBatcher(cfg.Tuning.BatchChangeDelay, n)
|
b := newBatcher(cfg.Tuning.BatchChangeDelay, n)
|
||||||
n.b = b
|
n.b = b
|
||||||
|
@ -51,9 +53,19 @@ func NewNotifier(cfg *types.Config) *Notifier {
|
||||||
return n
|
return n
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close stops the batcher inside the notifier.
|
// Close stops the batcher and closes all channels.
|
||||||
func (n *Notifier) Close() {
|
func (n *Notifier) Close() {
|
||||||
|
notifierWaitersForLock.WithLabelValues("lock", "close").Inc()
|
||||||
|
n.l.Lock()
|
||||||
|
defer n.l.Unlock()
|
||||||
|
notifierWaitersForLock.WithLabelValues("lock", "close").Dec()
|
||||||
|
|
||||||
|
n.closed = true
|
||||||
n.b.close()
|
n.b.close()
|
||||||
|
|
||||||
|
for _, c := range n.nodes {
|
||||||
|
close(c)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (n *Notifier) tracef(nID types.NodeID, msg string, args ...any) {
|
func (n *Notifier) tracef(nID types.NodeID, msg string, args ...any) {
|
||||||
|
@ -70,6 +82,10 @@ func (n *Notifier) AddNode(nodeID types.NodeID, c chan<- types.StateUpdate) {
|
||||||
notifierWaitersForLock.WithLabelValues("lock", "add").Dec()
|
notifierWaitersForLock.WithLabelValues("lock", "add").Dec()
|
||||||
notifierWaitForLock.WithLabelValues("add").Observe(time.Since(start).Seconds())
|
notifierWaitForLock.WithLabelValues("add").Observe(time.Since(start).Seconds())
|
||||||
|
|
||||||
|
if n.closed {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// If a channel exists, it means the node has opened a new
|
// If a channel exists, it means the node has opened a new
|
||||||
// connection. Close the old channel and replace it.
|
// connection. Close the old channel and replace it.
|
||||||
if curr, ok := n.nodes[nodeID]; ok {
|
if curr, ok := n.nodes[nodeID]; ok {
|
||||||
|
@ -96,6 +112,10 @@ func (n *Notifier) RemoveNode(nodeID types.NodeID, c chan<- types.StateUpdate) b
|
||||||
notifierWaitersForLock.WithLabelValues("lock", "remove").Dec()
|
notifierWaitersForLock.WithLabelValues("lock", "remove").Dec()
|
||||||
notifierWaitForLock.WithLabelValues("remove").Observe(time.Since(start).Seconds())
|
notifierWaitForLock.WithLabelValues("remove").Observe(time.Since(start).Seconds())
|
||||||
|
|
||||||
|
if n.closed {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
if len(n.nodes) == 0 {
|
if len(n.nodes) == 0 {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
@ -154,6 +174,10 @@ func (n *Notifier) NotifyWithIgnore(
|
||||||
update types.StateUpdate,
|
update types.StateUpdate,
|
||||||
ignoreNodeIDs ...types.NodeID,
|
ignoreNodeIDs ...types.NodeID,
|
||||||
) {
|
) {
|
||||||
|
if n.closed {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
notifierUpdateReceived.WithLabelValues(update.Type.String(), types.NotifyOriginKey.Value(ctx)).Inc()
|
notifierUpdateReceived.WithLabelValues(update.Type.String(), types.NotifyOriginKey.Value(ctx)).Inc()
|
||||||
n.b.addOrPassthrough(update)
|
n.b.addOrPassthrough(update)
|
||||||
}
|
}
|
||||||
|
@ -170,6 +194,10 @@ func (n *Notifier) NotifyByNodeID(
|
||||||
notifierWaitersForLock.WithLabelValues("lock", "notify").Dec()
|
notifierWaitersForLock.WithLabelValues("lock", "notify").Dec()
|
||||||
notifierWaitForLock.WithLabelValues("notify").Observe(time.Since(start).Seconds())
|
notifierWaitForLock.WithLabelValues("notify").Observe(time.Since(start).Seconds())
|
||||||
|
|
||||||
|
if n.closed {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
if c, ok := n.nodes[nodeID]; ok {
|
if c, ok := n.nodes[nodeID]; ok {
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
|
@ -205,6 +233,10 @@ func (n *Notifier) sendAll(update types.StateUpdate) {
|
||||||
notifierWaitersForLock.WithLabelValues("lock", "send-all").Dec()
|
notifierWaitersForLock.WithLabelValues("lock", "send-all").Dec()
|
||||||
notifierWaitForLock.WithLabelValues("send-all").Observe(time.Since(start).Seconds())
|
notifierWaitForLock.WithLabelValues("send-all").Observe(time.Since(start).Seconds())
|
||||||
|
|
||||||
|
if n.closed {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
for id, c := range n.nodes {
|
for id, c := range n.nodes {
|
||||||
// Whenever an update is sent to all nodes, there is a chance that the node
|
// Whenever an update is sent to all nodes, there is a chance that the node
|
||||||
// has disconnected and the goroutine that was supposed to consume the update
|
// has disconnected and the goroutine that was supposed to consume the update
|
||||||
|
|
Loading…
Reference in New Issue