blob: b560a3615f7b588017b79e94b7e0ea68db1250c6 [file] [log] [blame]
// Copyright 2019 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"log"
"net"
"os"
"os/exec"
"time"
devicePkg "go.fuchsia.dev/infra/devices"
"go.fuchsia.dev/tools/netboot"
)
// usage is the help text that the flag.Usage handler installed in init writes
// to stderr ahead of the flag defaults.
const usage = `usage: health_checker [options]
Checks the health of the attached device by checking to see if it can
discover and ping the device's netsvc address. A healthy device should be
running in Zedboot.
`
// Command line flag values. Each variable is bound to its flag in init and
// holds that flag's default until flag.Parse runs in main.
var (
// timeout is the value of -timeout (default 10s); main passes it to
// netboot.NewClient.
timeout time.Duration
// configFile is the value of -config: the path of the JSON config file that
// main hands to devicePkg.LoadDeviceConfigs.
configFile string
// rebootIfUnhealthy is the value of -reboot: when set, main attempts to
// reboot every device whose health check reports the unhealthy state.
rebootIfUnhealthy bool
// forceReboot is the value of -force-reboot: when set, main skips the health
// checks and reboots every device.
forceReboot bool
)
const (
// healthyState is the HealthCheckResult.State value reported when every
// check in checkHealth passes.
healthyState = "healthy"
// unhealthyState is the HealthCheckResult.State value reported when any
// check in checkHealth fails.
unhealthyState = "unhealthy"
// logFile is the path that main opens in append mode (creating it if needed)
// and installs as the output of the standard logger.
logFile = "/tmp/health_checker.log"
)
// HealthCheckResult is the outcome of a health check on one hardware device.
// A slice of these is serialized to JSON as the program's output.
type HealthCheckResult struct {
// Nodename is the nodename of the device that was checked.
Nodename string `json:"nodename"`
// State is the health status of the device (either "healthy" or "unhealthy").
State string `json:"state"`
// ErrorMsg is the error message of the check that failed; it is empty when
// the device is healthy.
ErrorMsg string `json:"error_msg"`
}
// pingZedboot discovers the netsvc address for nodename and runs the system
// ping command against it in IPv6 mode with a count of one.
//
// It returns an error when discovery fails or when the ping command does not
// complete successfully, and nil otherwise.
func pingZedboot(n *netboot.Client, nodename string) error {
	addr, discoverErr := n.Discover(nodename, false)
	if discoverErr != nil {
		return fmt.Errorf("Failed to discover netsvc addr: %v.", discoverErr)
	}

	// Carry the zone along with the IP so the address string that is handed to
	// ping includes it.
	target := &net.IPAddr{IP: addr.IP, Zone: addr.Zone}
	ping := exec.Command("ping", "-6", target.String(), "-c", "1")

	_, pingErr := ping.Output()
	if pingErr == nil {
		return nil
	}
	return fmt.Errorf("Failed to ping netsvc addr %s: %v.", target, pingErr)
}
// ensureNotFuchsia discovers the Fuchsia address for nodename and runs the
// system ping command against it in IPv6 mode with a count of one.
//
// The check is inverted relative to pingZedboot: a ping that succeeds means
// the device is running Fuchsia rather than Zedboot, so that case is reported
// as an error. A discovery failure is also reported as an error; a failed
// ping yields nil.
func ensureNotFuchsia(n *netboot.Client, nodename string) error {
	addr, discoverErr := n.Discover(nodename, true)
	if discoverErr != nil {
		return fmt.Errorf("Failed to discover fuchsia addr: %v.", discoverErr)
	}

	target := &net.IPAddr{IP: addr.IP, Zone: addr.Zone}
	ping := exec.Command("ping", "-6", target.String(), "-c", "1")

	if _, pingErr := ping.Output(); pingErr != nil {
		// The Fuchsia address did not answer, which is the desired outcome.
		return nil
	}
	return fmt.Errorf("Device is in Fuchsia, should be in Zedboot.")
}
// deviceInZedboot reports whether the device named nodename appears to be
// running Zedboot: its netsvc address must answer a ping (pingZedboot) and its
// Fuchsia address must not (ensureNotFuchsia). The first failing check's error
// is returned; nil means both checks passed.
func deviceInZedboot(n *netboot.Client, nodename string) error {
	zedbootErr := pingZedboot(n, nodename)
	if zedbootErr != nil {
		return zedbootErr
	}
	return ensureNotFuchsia(n, nodename)
}
// checkSerial sends an echo command over the device's serial line and verifies
// that the expected echo and response come back.
//
// The check only runs for *devicePkg.Nuc devices and returns nil for every
// other device type; per the original authors it proved flaky on
// astros/sherlocks. Errors from sending or reading are returned as-is, and
// unexpected output is logged and reported as an error.
func checkSerial(device devicePkg.Device) error {
	if _, isNuc := device.(*devicePkg.Nuc); !isNuc {
		return nil
	}

	const (
		command  = "echo hello"
		expected = "\r\n$ echo hello\r\nhello"
	)
	ctx := context.Background()

	if err := device.SendSerialCommand(ctx, command); err != nil {
		return err
	}

	// The read buffer is sized to exactly the expected response so the
	// comparison below is a plain string equality.
	received := make([]byte, len(expected))
	if err := device.ReadSerialData(ctx, received); err != nil {
		return err
	}

	if got := string(received); got != expected {
		log.Printf("serial test got unexpected output: %s", got)
		return fmt.Errorf("serial test got unexpected output")
	}
	return nil
}
// checkBroadcasting verifies that broadcast packets from the device can be
// received by waiting for a beacon on the netboot client.
//
// The check only runs for *devicePkg.ArmCDCether devices; for every other
// device type (NUCs included) it returns nil without listening. Any error
// from Beacon is returned unchanged.
func checkBroadcasting(n *netboot.Client, device devicePkg.Device) error {
	switch device.(type) {
	case *devicePkg.ArmCDCether:
		_, err := n.Beacon()
		return err
	default:
		return nil
	}
}
// checkHealth runs every health check against device and summarizes the
// outcome. The checks run in a fixed order and stop at the first failure,
// whose error text becomes the result's ErrorMsg with the unhealthy state. If
// all checks pass the result carries the healthy state and an empty ErrorMsg.
func checkHealth(n *netboot.Client, device devicePkg.Device) HealthCheckResult {
	nodename := device.Nodename()
	log.Printf("Checking health for %s", nodename)

	checks := []func() error{
		// The device must be in Zedboot.
		func() error { return deviceInZedboot(n, nodename) },
		// The device must respond over serial (only exercised for NUCs).
		func() error { return checkSerial(device) },
		// The device must be broadcasting (only exercised for ArmCDCether).
		func() error { return checkBroadcasting(n, device) },
	}
	for _, check := range checks {
		if err := check(); err != nil {
			return HealthCheckResult{
				Nodename: nodename,
				State:    unhealthyState,
				ErrorMsg: err.Error(),
			}
		}
	}
	return HealthCheckResult{Nodename: nodename, State: healthyState}
}
// printHealthCheckResults writes checkResults to stdout as a single line of
// JSON: an array with one object per device.
//
// A nil slice is emitted as an empty array ("[]") rather than JSON "null", so
// consumers always receive an array even when no devices were checked.
//
// It returns the error from marshaling or from writing to stdout, if any.
func printHealthCheckResults(checkResults []HealthCheckResult) error {
	if checkResults == nil {
		// encoding/json encodes a nil slice as null; substitute an empty,
		// non-nil slice to keep the output shape stable.
		checkResults = []HealthCheckResult{}
	}
	output, err := json.Marshal(checkResults)
	if err != nil {
		return err
	}
	// The JSON on stdout is the program's only product, so a failed write is
	// surfaced to the caller instead of being dropped.
	_, err = fmt.Println(string(output))
	return err
}
func init() {
flag.Usage = func() {
fmt.Fprint(os.Stderr, usage)
flag.PrintDefaults()
}
// First set the flags ...
flag.StringVar(&configFile, "config", "/etc/catalyst/config.json",
"The path of the json config file that contains the nodename of the device.")
flag.DurationVar(&timeout, "timeout", 10*time.Second,
"The timeout for checking each device. The format should be a value acceptable to time.ParseDuration.")
flag.BoolVar(&rebootIfUnhealthy, "reboot", false, "If true, attempt to reboot the device if unhealthy.")
flag.BoolVar(&forceReboot, "force-reboot", false, "If true, will skip health checks and reboot the device.")
}
func main() {
flag.Parse()
client := netboot.NewClient(timeout)
ctx := context.Background()
configs, err := devicePkg.LoadDeviceConfigs(configFile)
if err != nil {
log.Fatal(err)
}
devices, err := devicePkg.CreateDevices(ctx, configs, nil)
if err != nil {
log.Fatal(err)
}
f, err := os.OpenFile(logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
log.Fatal(err)
}
defer f.Close()
log.SetOutput(f)
if forceReboot {
for _, device := range devices {
log.Printf("attempting forced device restart for: %s", device.Nodename())
if err := device.Powercycle(ctx); err == nil {
time.Sleep(1 * time.Minute)
} else {
log.Printf("powercycle failed: %v", err)
}
if device.HasSerial() {
if err := device.SoftReboot(ctx, "R", "serial"); err != nil {
log.Printf("serial reboot failed: %v", err)
}
} else {
if err := device.SoftReboot(ctx, "R", "ssh"); err != nil {
log.Printf("ssh reboot failed: %v", err)
}
}
log.Printf("forced restart for device %s is complete", device.Nodename())
}
return
}
var checkResultSlice []HealthCheckResult
for _, device := range devices {
checkResult := checkHealth(client, device)
log.Printf("state=%s, error_msg=%s", checkResult.State, checkResult.ErrorMsg)
if checkResult.State == unhealthyState && rebootIfUnhealthy {
if err := device.Powercycle(ctx); err == nil {
time.Sleep(1 * time.Minute)
} else {
log.Printf("powercycle failed: %v", err)
}
if device.HasSerial() {
if err := device.SoftReboot(ctx, "R", "serial"); err != nil {
log.Printf("serial reboot failed: %v", err)
}
} else {
if err := device.SoftReboot(ctx, "R", "ssh"); err != nil {
log.Printf("ssh reboot failed: %v", err)
}
}
}
checkResultSlice = append(checkResultSlice, checkResult)
}
if err = printHealthCheckResults(checkResultSlice); err != nil {
log.Fatal(err)
}
}