Merge pull request #44 from andrewhsu/sup
[18.09] Fix supervisor healthcheck throttling
diff --git a/libcontainerd/supervisor/remote_daemon.go b/libcontainerd/supervisor/remote_daemon.go
index b520d48..182984a 100644
--- a/libcontainerd/supervisor/remote_daemon.go
+++ b/libcontainerd/supervisor/remote_daemon.go
@@ -245,20 +245,26 @@
}()
for {
- select {
- case <-ctx.Done():
- r.logger.Info("stopping healthcheck following graceful shutdown")
- if client != nil {
- client.Close()
+ if delay != nil {
+ select {
+ case <-ctx.Done():
+ r.logger.Info("stopping healthcheck following graceful shutdown")
+ if client != nil {
+ client.Close()
+ }
+ return
+ case <-delay:
}
- return
- case <-delay:
- default:
}
if r.daemonPid == -1 {
if r.daemonWaitCh != nil {
- <-r.daemonWaitCh
+ select {
+ case <-ctx.Done():
+ r.logger.Info("stopping containerd startup following graceful shutdown")
+ return
+ case <-r.daemonWaitCh:
+ }
}
os.RemoveAll(r.GRPC.Address)
@@ -276,26 +282,28 @@
}
}
- tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
- _, err := client.IsServing(tctx)
- cancel()
- if err == nil {
- if !started {
- close(r.daemonStartCh)
- started = true
+ if client != nil {
+ tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
+ _, err := client.IsServing(tctx)
+ cancel()
+ if err == nil {
+ if !started {
+ close(r.daemonStartCh)
+ started = true
+ }
+
+ transientFailureCount = 0
+ delay = time.After(500 * time.Millisecond)
+ continue
}
- transientFailureCount = 0
- delay = time.After(500 * time.Millisecond)
- continue
- }
+ r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")
- r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")
-
- transientFailureCount++
- if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
- delay = time.After(time.Duration(transientFailureCount) * 200 * time.Millisecond)
- continue
+ transientFailureCount++
+ if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
+ delay = time.After(time.Duration(transientFailureCount) * 200 * time.Millisecond)
+ continue
+ }
}
if system.IsProcessAlive(r.daemonPid) {
@@ -304,6 +312,7 @@
}
client.Close()
+ client = nil
r.daemonPid = -1
delay = nil
transientFailureCount = 0