[health checker][catalyst] Remove extra reboots. This change does several things: 1) Removes reboots from catalyst, as the health checker does this after the task anyway. 2) Has the "force-reboot" option in the health checker attempt a soft reboot, so that proper health checks can skip straight to powercycling. 3) Has health checks skip soft reboots. Bug:44269 Change-Id: I27cc8559516037eaa592de93a1c7c3773bc96c7a

commit: a4b37878529202f054930707c2f98c5ffe44e9f4 [log] [tgz]
author: Anirudh Mathukumilli <rudymathu@google.com> Tue Jan 21 17:25:31 2020 -0800
committer: Anirudh Mathukumilli <rudymathu@google.com> Tue Jan 21 17:30:53 2020 -0800
tree: 9df309e76932be2724cde0b5d3a9e7ae39ae9226
parent: add538dc4d1bff04ad3ecbf996ebed46b59daed6 [diff]
diff --git a/cmd/catalyst/main.go b/cmd/catalyst/main.go
index 28e0577..7dcab63 100644
--- a/cmd/catalyst/main.go
+++ b/cmd/catalyst/main.go

@@ -46,21 +46,6 @@
 	flag.StringVar(&bootserverPath, "bootserver", "", "Path to the bootserver binary")
 }
 
-func rebootDevices(ctx context.Context, devices []*devicePkg.DeviceTarget) {
-	errs := make(chan error)
-	defer close(errs)
-	for _, device := range devices {
-		go func() {
-			errs <- device.Restart(ctx)
-		}()
-	}
-	for i := 0; i < len(devices); i++ {
-		if err := <-errs; err != nil {
-			log.Printf("%s\n", err)
-		}
-	}
-}
-
 // Runs a subprocess and sets up a handler that propagates SIGTERM on context cancel.
 func runSubprocess(ctx context.Context, command []string) int {
 	if len(command) == 0 {
@@ -263,7 +248,6 @@
 				log.Printf("NUC server shutdown failed: %v", err)
 			}
 		}
-		rebootDevices(ctx, devices)
 	}()
 
 	if exitCode := runBootservers(ctx, devices); exitCode != 0 {

diff --git a/cmd/health_checker/main.go b/cmd/health_checker/main.go
index 079756a..72143ce 100644
--- a/cmd/health_checker/main.go
+++ b/cmd/health_checker/main.go

@@ -37,8 +37,6 @@
 	healthyState         = "healthy"
 	unhealthyState       = "unhealthy"
 	logFile              = "/tmp/health_checker.log"
-	zedbootWaitDuration  = 1 * time.Minute
-	zedbootCheckInterval = 10 * time.Second
 )
 
 // DeviceHealthProperties contains health properties of a hardware device.
@@ -89,20 +87,6 @@
 	return nil
 }
 
-func waitOnZedboot(n *netboot.Client, nodename string) error {
-	log.Printf("Waiting for Zedboot to come back up.")
-	start := time.Now()
-	for time.Since(start) < zedbootWaitDuration {
-		time.Sleep(zedbootCheckInterval)
-		if err := deviceInZedboot(n, nodename); err == nil {
-			return nil
-		} else {
-			log.Printf("%s is not in zedboot yet: %s", nodename, err)
-		}
-	}
-	return fmt.Errorf("Device did not come back into zedboot.")
-}
-
 func checkHealth(n *netboot.Client, nodename string) HealthCheckResult {
 	log.Printf("Checking health for %s", nodename)
 	if err := deviceInZedboot(n, nodename); err != nil {
@@ -135,27 +119,6 @@
 	flag.BoolVar(&forceReboot, "force-reboot", false, "If true, will skip health checks and reboot the device.")
 }
 
-func attemptReboot(ctx context.Context, n *netboot.Client, device *devicePkg.DeviceTarget) error {
-	log.Printf("Attempting reboot for %s", device.Nodename())
-
-	// Attempt to restart the device via serial/SSH. If the restart succeeds,
-	// wait for zedboot to come up. If the restart fails, continue to powercycle.
-	if err := device.Restart(ctx); err != nil {
-		log.Printf("Error soft rebooting device: %s", err)
-	} else if err := waitOnZedboot(n, device.Nodename()); err == nil {
-		// Wait for Zedboot to come back up. If it comes back up, we're done.
-		return nil
-	}
-
-	// If we get here, Zedboot never came back up, so we need to powercycle the device.
-	// Powercycle also sends dm reboot-recovery, so the device should come back in zedboot.
-	log.Printf("Zedboot did not come up; attempting powercycle")
-	if err := device.Powercycle(ctx); err != nil {
-		return err
-	}
-	return waitOnZedboot(n, device.Nodename())
-}
-
 func main() {
 	flag.Parse()
 	client := netboot.NewClient(timeout)
@@ -174,9 +137,11 @@
 
 	if forceReboot {
 		for _, device := range devices {
-			if err := attemptReboot(ctx, client, device); err != nil {
-				log.Printf("force reboot failed with error: %s", err.Error())
+			log.Printf("attempting forced device restart for: %s", device.Nodename())
+			if err := device.Restart(ctx); err != nil {
+				log.Printf("forced restart failed with error: %s", err.Error())
 			}
+			log.Printf("forced restart for device %s is complete", device.Nodename())
 		}
 		return
 	}
@@ -186,7 +151,7 @@
 		checkResult := checkHealth(client, device.Nodename())
 		log.Printf("state=%s, error_msg=%s", checkResult.State, checkResult.ErrorMsg)
 		if checkResult.State == unhealthyState && rebootIfUnhealthy {
-			if err := attemptReboot(ctx, client, device); err != nil {
+			if err := device.Powercycle(ctx); err != nil {
 				log.Printf("reboot failed with error: %s", err.Error())
 				checkResult.ErrorMsg += "; " + err.Error()
 			} else {
commit	a4b37878529202f054930707c2f98c5ffe44e9f4	[log] [tgz]
author	Anirudh Mathukumilli <rudymathu@google.com>	Tue Jan 21 17:25:31 2020 -0800
committer	Anirudh Mathukumilli <rudymathu@google.com>	Tue Jan 21 17:30:53 2020 -0800
tree	9df309e76932be2724cde0b5d3a9e7ae39ae9226
parent	add538dc4d1bff04ad3ecbf996ebed46b59daed6 [diff]