[health checker][catalyst] Remove extra reboots.
This change does several things:
1) Remove reboots from catalyst, as the health checker already reboots
devices after the task anyway.
2) Have the "force-reboot" option in the health checker attempt only a
soft reboot, so that proper health checks can skip to powercycling.
3) Have health checks skip soft reboots and go straight to powercycling.
Bug:44269
Change-Id: I27cc8559516037eaa592de93a1c7c3773bc96c7a
diff --git a/cmd/catalyst/main.go b/cmd/catalyst/main.go
index 28e0577..7dcab63 100644
--- a/cmd/catalyst/main.go
+++ b/cmd/catalyst/main.go
@@ -46,21 +46,6 @@
flag.StringVar(&bootserverPath, "bootserver", "", "Path to the bootserver binary")
}
-func rebootDevices(ctx context.Context, devices []*devicePkg.DeviceTarget) {
- errs := make(chan error)
- defer close(errs)
- for _, device := range devices {
- go func() {
- errs <- device.Restart(ctx)
- }()
- }
- for i := 0; i < len(devices); i++ {
- if err := <-errs; err != nil {
- log.Printf("%s\n", err)
- }
- }
-}
-
// Runs a subprocess and sets up a handler that propagates SIGTERM on context cancel.
func runSubprocess(ctx context.Context, command []string) int {
if len(command) == 0 {
@@ -263,7 +248,6 @@
log.Printf("NUC server shutdown failed: %v", err)
}
}
- rebootDevices(ctx, devices)
}()
if exitCode := runBootservers(ctx, devices); exitCode != 0 {
diff --git a/cmd/health_checker/main.go b/cmd/health_checker/main.go
index 079756a..72143ce 100644
--- a/cmd/health_checker/main.go
+++ b/cmd/health_checker/main.go
@@ -37,8 +37,6 @@
healthyState = "healthy"
unhealthyState = "unhealthy"
logFile = "/tmp/health_checker.log"
- zedbootWaitDuration = 1 * time.Minute
- zedbootCheckInterval = 10 * time.Second
)
// DeviceHealthProperties contains health properties of a hardware device.
@@ -89,20 +87,6 @@
return nil
}
-func waitOnZedboot(n *netboot.Client, nodename string) error {
- log.Printf("Waiting for Zedboot to come back up.")
- start := time.Now()
- for time.Since(start) < zedbootWaitDuration {
- time.Sleep(zedbootCheckInterval)
- if err := deviceInZedboot(n, nodename); err == nil {
- return nil
- } else {
- log.Printf("%s is not in zedboot yet: %s", nodename, err)
- }
- }
- return fmt.Errorf("Device did not come back into zedboot.")
-}
-
func checkHealth(n *netboot.Client, nodename string) HealthCheckResult {
log.Printf("Checking health for %s", nodename)
if err := deviceInZedboot(n, nodename); err != nil {
@@ -135,27 +119,6 @@
flag.BoolVar(&forceReboot, "force-reboot", false, "If true, will skip health checks and reboot the device.")
}
-func attemptReboot(ctx context.Context, n *netboot.Client, device *devicePkg.DeviceTarget) error {
- log.Printf("Attempting reboot for %s", device.Nodename())
-
- // Attempt to restart the device via serial/SSH. If the restart succeeds,
- // wait for zedboot to come up. If the restart fails, continue to powercycle.
- if err := device.Restart(ctx); err != nil {
- log.Printf("Error soft rebooting device: %s", err)
- } else if err := waitOnZedboot(n, device.Nodename()); err == nil {
- // Wait for Zedboot to come back up. If it comes back up, we're done.
- return nil
- }
-
- // If we get here, Zedboot never came back up, so we need to powercycle the device.
- // Powercycle also sends dm reboot-recovery, so the device should come back in zedboot.
- log.Printf("Zedboot did not come up; attempting powercycle")
- if err := device.Powercycle(ctx); err != nil {
- return err
- }
- return waitOnZedboot(n, device.Nodename())
-}
-
func main() {
flag.Parse()
client := netboot.NewClient(timeout)
@@ -174,9 +137,11 @@
if forceReboot {
for _, device := range devices {
- if err := attemptReboot(ctx, client, device); err != nil {
- log.Printf("force reboot failed with error: %s", err.Error())
+ log.Printf("attempting forced device restart for: %s", device.Nodename())
+ if err := device.Restart(ctx); err != nil {
+ log.Printf("forced restart failed with error: %s", err.Error())
}
+ log.Printf("forced restart for device %s is complete", device.Nodename())
}
return
}
@@ -186,7 +151,7 @@
checkResult := checkHealth(client, device.Nodename())
log.Printf("state=%s, error_msg=%s", checkResult.State, checkResult.ErrorMsg)
if checkResult.State == unhealthyState && rebootIfUnhealthy {
- if err := attemptReboot(ctx, client, device); err != nil {
+ if err := device.Powercycle(ctx); err != nil {
log.Printf("reboot failed with error: %s", err.Error())
checkResult.ErrorMsg += "; " + err.Error()
} else {