[testrunner] Make execution over SSH more resilient. This adds logic to health-check the underlying SSH client with a keepalive request before each test is run, and to replace the client with a freshly connected one if it is unresponsive. Bug: INTK-894 #comment Change-Id: I8f938c1abb41242fa0b3067e665b38d1a0e2b690

commit: ec0103c4cd24873a04fa1800540f3a6ac9e0b76b [log] [tgz]
author: Joshua Seaton <joshuaseaton@google.com> Sun Mar 31 23:14:57 2019 +0000
committer: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org> Sun Mar 31 23:14:57 2019 +0000
tree: 8fb3718009224209a8334f7bcbb1c27b935b150d
parent: 04b1a39b7dfe37bf8d3188c3512c6d8e155549b2 [diff]
diff --git a/cmd/testrunner/tester.go b/cmd/testrunner/tester.go
index 369f52b..150ee96 100644
--- a/cmd/testrunner/tester.go
+++ b/cmd/testrunner/tester.go

@@ -10,6 +10,7 @@
 	"io"
 	"path"
 
+	"fuchsia.googlesource.com/tools/logger"
 	"fuchsia.googlesource.com/tools/runner"
 	"fuchsia.googlesource.com/tools/sshutil"
 	"fuchsia.googlesource.com/tools/testsharder"
@@ -19,6 +20,10 @@
 const (
 	// The test output directory to create on the Fuchsia device.
 	fuchsiaOutputDir = "/data/infra/testrunner"
+
+	// A conventionally used global request name for checking the status of a client
+	// connection to an OpenSSH server.
+	keepAliveOpenSSH = "keepalive@openssh.com"
 )
 
 // Tester is executes a Test.
@@ -51,22 +56,29 @@
 // contains the command line to execute on the remote machine. The caller should Close() the
 // tester when finished. Once closed, this object can no longer be used.
 type SSHTester struct {
-	client *ssh.Client
+	client    *ssh.Client
+	newClient func(ctx context.Context) (*ssh.Client, error)
 }
 
-func NewSSHTester(nodename string, sshKey []byte) (*SSHTester, error) {
-	config, err := sshutil.DefaultSSHConfig(sshKey)
+func NewSSHTester(newClient func(context.Context) (*ssh.Client, error)) (*SSHTester, error) {
+	client, err := newClient(context.Background())
 	if err != nil {
-		return nil, fmt.Errorf("failed to create an SSH client config: %v", err)
+		return nil, err
 	}
-	client, err := sshutil.ConnectToNode(context.Background(), nodename, config)
-	if err != nil {
-		return nil, fmt.Errorf("failed to connect to node %q: %v", nodename, err)
-	}
-	return &SSHTester{client: client}, nil
+	return &SSHTester{client: client, newClient: newClient}, nil
 }
 
 func (t *SSHTester) Test(ctx context.Context, test testsharder.Test, stdout io.Writer, stderr io.Writer) error {
+	if _, _, err := t.client.Conn.SendRequest(keepAliveOpenSSH, true, nil); err != nil {
+		logger.Errorf(ctx, "SSH client not responsive: %v", err)
+		client, err := t.newClient(ctx)
+		if err != nil {
+			return fmt.Errorf("failed to create new SSH client: %v", err)
+		}
+		t.client.Close()
+		t.client = client
+	}
+
 	session, err := t.client.NewSession()
 	if err != nil {
 		return err
@@ -98,7 +110,19 @@
 // NewFuchsiaTester creates a FuchsiaTester object and starts a log_listener process on
 // the remote device. The log_listener output can be read from SysLogOutput().
 func NewFuchsiaTester(nodename string, sshKey []byte) (*FuchsiaTester, error) {
-	delegate, err := NewSSHTester(nodename, sshKey)
+	newClient := func(ctx context.Context) (*ssh.Client, error) {
+		config, err := sshutil.DefaultSSHConfig(sshKey)
+		if err != nil {
+			return nil, fmt.Errorf("failed to create an SSH client config: %v", err)
+		}
+		client, err := sshutil.ConnectToNode(ctx, nodename, config)
+		if err != nil {
+			return nil, fmt.Errorf("failed to connect to node %q: %v", nodename, err)
+		}
+		return client, nil
+	}
+
+	delegate, err := NewSSHTester(newClient)
 	if err != nil {
 		return nil, err
 	}

diff --git a/cmd/testrunner/tester_test.go b/cmd/testrunner/tester_test.go
index c393df8..abe295c 100644
--- a/cmd/testrunner/tester_test.go
+++ b/cmd/testrunner/tester_test.go

@@ -7,12 +7,16 @@
 import (
 	"bytes"
 	"context"
+	"fmt"
 	"io/ioutil"
 	"os"
 	"strings"
 	"testing"
 
+	"fuchsia.googlesource.com/tools/sshutil"
 	"fuchsia.googlesource.com/tools/testsharder"
+
+	"golang.org/x/crypto/ssh"
 )
 
 func TestTester(t *testing.T) {
@@ -118,7 +122,19 @@
 
 	for _, tt := range cases {
 		t.Run(tt.name, func(t *testing.T) {
-			tester, err := NewSSHTester(nodename, sshKey)
+			newClient := func(ctx context.Context) (*ssh.Client, error) {
+				config, err := sshutil.DefaultSSHConfig(sshKey)
+				if err != nil {
+					return nil, fmt.Errorf("failed to create an SSH client config: %v", err)
+				}
+				client, err := sshutil.ConnectToNode(ctx, nodename, config)
+				if err != nil {
+					return nil, fmt.Errorf("failed to connect to node %q: %v", nodename, err)
+				}
+				return client, nil
+			}
+
+			tester, err := NewSSHTester(newClient)
 			if err != nil {
 				t.Errorf("failed to intialize tester: %v", err)
 				return
commit	ec0103c4cd24873a04fa1800540f3a6ac9e0b76b	[log] [tgz]
author	Joshua Seaton <joshuaseaton@google.com>	Sun Mar 31 23:14:57 2019 +0000
committer	CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>	Sun Mar 31 23:14:57 2019 +0000
tree	8fb3718009224209a8334f7bcbb1c27b935b150d
parent	04b1a39b7dfe37bf8d3188c3512c6d8e155549b2 [diff]