[system-update-committer] implement wait_and_reboot

This change adds reboot functionality to the system-update-committer.
Generally, if we fail to put the boot metadata in a happy state, we'll
reboot. Exception: if it's a blobfs verify failure, and the config says
to ignore, we won't reboot.

Note: we can't yet integration test the exception because currently the
blobfs verification will always pass.

Fixed: 64600
Test: fx test -o system-update-committer-tests
Test: fx test -o system-update-committer-integration-tests
Change-Id: Ic48be5220870c31ab0450cd2ea19f46ddaef6021
Reviewed-on: https://fuchsia-review.googlesource.com/c/fuchsia/+/475560
Commit-Queue: Zach Kirschenbaum <zkbaum@google.com>
Reviewed-by: Ben Keller <galbanum@google.com>
diff --git a/src/sys/pkg/bin/system-update-committer/BUILD.gn b/src/sys/pkg/bin/system-update-committer/BUILD.gn
index 7eb1daf..7ea3124 100644
--- a/src/sys/pkg/bin/system-update-committer/BUILD.gn
+++ b/src/sys/pkg/bin/system-update-committer/BUILD.gn
@@ -14,6 +14,7 @@
   edition = "2018"
 
   deps = [
+    "//sdk/fidl/fuchsia.hardware.power.statecontrol:fuchsia.hardware.power.statecontrol-rustc",
     "//sdk/fidl/fuchsia.paver:fuchsia.paver-rustc",
     "//sdk/fidl/fuchsia.update:fuchsia.update-rustc",
     "//src/lib/fidl/rust/fidl",
@@ -30,8 +31,10 @@
   ]
   test_deps = [
     "//src/sys/pkg/testing/mock-paver",
+    "//src/sys/pkg/testing/mock-reboot",
     "//src/sys/pkg/testing/mock-verifier:lib",
     "//third_party/rust_crates:matches",
+    "//third_party/rust_crates:parking_lot",
     "//third_party/rust_crates:proptest",
     "//third_party/rust_crates:proptest-derive",
   ]
@@ -47,6 +50,7 @@
     "src/metadata/errors.rs",
     "src/metadata/policy.rs",
     "src/metadata/verify.rs",
+    "src/reboot.rs",
   ]
 
   visibility = [
diff --git a/src/sys/pkg/bin/system-update-committer/meta/system-update-committer.cml b/src/sys/pkg/bin/system-update-committer/meta/system-update-committer.cml
index 3215955..a951c57 100644
--- a/src/sys/pkg/bin/system-update-committer/meta/system-update-committer.cml
+++ b/src/sys/pkg/bin/system-update-committer/meta/system-update-committer.cml
@@ -8,7 +8,12 @@
     ],
     use: [
         { runner: "elf" },
-        { protocol: "fuchsia.paver.Paver" },
+        {
+            protocol: [
+                "fuchsia.hardware.power.statecontrol.Admin",
+                "fuchsia.paver.Paver",
+            ],
+        },
     ],
     expose: [
         {
diff --git a/src/sys/pkg/bin/system-update-committer/src/main.rs b/src/sys/pkg/bin/system-update-committer/src/main.rs
index 655417a..5575d5f 100644
--- a/src/sys/pkg/bin/system-update-committer/src/main.rs
+++ b/src/sys/pkg/bin/system-update-committer/src/main.rs
@@ -3,21 +3,36 @@
 // found in the LICENSE file.
 
 use {
-    crate::fidl::FidlServer,
+    crate::{
+        fidl::FidlServer,
+        metadata::put_metadata_in_happy_state,
+        reboot::{should_reboot, wait_and_reboot},
+    },
     anyhow::{anyhow, Context, Error},
     config::Config,
+    fidl_fuchsia_hardware_power_statecontrol::AdminMarker as PowerStateControlMarker,
     fidl_fuchsia_paver::PaverMarker,
     fuchsia_async as fasync,
     fuchsia_component::server::ServiceFs,
     fuchsia_syslog::{fx_log_info, fx_log_warn},
     fuchsia_zircon::{self as zx, HandleBased},
     futures::{channel::oneshot, prelude::*, stream::FuturesUnordered},
-    std::sync::Arc,
+    std::{sync::Arc, time::Duration},
 };
 
 mod config;
 mod fidl;
 mod metadata;
+mod reboot;
+
+// The system persists the reboot reason via a component called last_reboot. If we issue a reboot
+// before last_reboot starts, the reboot reason will not persist. Since last_reboot is v1 and the
+// system-update-committer is v2, and v2 components start earlier than v1 components, let's try to
+// minimize this risk by defining a minimum duration we must wait to issue a reboot. Ideally, reboot
+// clients should not have to worry about this. However, given the transition from v1 to v2, for now
+// we mitigate this with a timer. This value was determined experimentally on Astro, where there
+// seems to be a ~2 second gap between the system-update-committer and last_reboot starting.
+const MINIMUM_REBOOT_WAIT: Duration = Duration::from_secs(5);
 
 pub fn main() -> Result<(), Error> {
     fuchsia_syslog::init_with_tags(&["system-update-committer"])
@@ -37,9 +52,8 @@
 }
 
 async fn main_inner_async() -> Result<(), Error> {
-    // TODO(http://fxbug.dev/64595) Actually use the configuration once we start implementing
-    // health verification.
-    let _config = Config::load_from_config_data_or_default();
+    let config = Config::load_from_config_data_or_default();
+    let reboot_timer = fasync::Timer::new(MINIMUM_REBOOT_WAIT);
 
     let paver = fuchsia_component::client::connect_to_service::<PaverMarker>()
         .context("while connecting to paver")?;
@@ -50,6 +64,9 @@
         .find_boot_manager(boot_manager_server_end)
         .context("transport error while calling find_boot_manager()")?;
 
+    let reboot_proxy = fuchsia_component::client::connect_to_service::<PowerStateControlMarker>()
+        .context("while connecting to power state control")?;
+
     let futures = FuturesUnordered::new();
     let (p_internal, p_external) = zx::EventPair::create().context("while creating EventPairs")?;
 
@@ -60,16 +77,25 @@
 
     let (unblocker, blocker) = oneshot::channel();
 
-    // Handle putting boot metadata in happy state.
+    // Handle putting boot metadata in happy state and rebooting on failure (if necessary).
     futures.push(
         async move {
-            // TODO(http://fxbug.dev/64595) combine the config and the result of
-            // put_metadata_in_happy_state to determine if we should reboot. For now, we just log.
-            if let Err(e) =
-                crate::metadata::put_metadata_in_happy_state(&boot_manager, &p_internal, unblocker)
-                    .await
+            if let Err(e) = put_metadata_in_happy_state(&boot_manager, &p_internal, unblocker).await
             {
-                fx_log_warn!("error putting boot metadata in happy state: {:#}", anyhow!(e));
+                if should_reboot(&e, &config) {
+                    fx_log_warn!(
+                        "Failed to put metadata in happy state. Rebooting given error {:#} and config {:?}",
+                        anyhow!(e),
+                        config
+                    );
+                    wait_and_reboot(&reboot_proxy, reboot_timer).await;
+                } else {
+                    fx_log_warn!(
+                        "Failed to put metadata in happy state. NOT rebooting given error {:#} and config {:?}",
+                        anyhow!(e),
+                        config
+                    );
+                }
             }
         }
         .boxed_local(),
diff --git a/src/sys/pkg/bin/system-update-committer/src/metadata.rs b/src/sys/pkg/bin/system-update-committer/src/metadata.rs
index 1418d20..bc164c2 100644
--- a/src/sys/pkg/bin/system-update-committer/src/metadata.rs
+++ b/src/sys/pkg/bin/system-update-committer/src/metadata.rs
@@ -6,7 +6,6 @@
 
 use {
     commit::do_commit,
-    errors::MetadataError,
     fidl_fuchsia_paver as paver,
     fuchsia_zircon::{self as zx, EventPair, Peered},
     futures::channel::oneshot,
@@ -14,6 +13,11 @@
     verify::do_health_verification,
 };
 
+pub(super) use errors::{MetadataError, VerifyError};
+
+#[cfg(test)]
+pub(super) use errors::{BootManagerError, PolicyError, VerifyFailureReason};
+
 mod commit;
 mod configuration;
 mod errors;
diff --git a/src/sys/pkg/bin/system-update-committer/src/reboot.rs b/src/sys/pkg/bin/system-update-committer/src/reboot.rs
new file mode 100644
index 0000000..2c21035
--- /dev/null
+++ b/src/sys/pkg/bin/system-update-committer/src/reboot.rs
@@ -0,0 +1,165 @@
+// Copyright 2021 The Fuchsia Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+use {
+    crate::{
+        config::{Config, Mode},
+        metadata::{MetadataError, VerifyError},
+    },
+    anyhow::{anyhow, Context},
+    fidl_fuchsia_hardware_power_statecontrol::{
+        AdminProxy as PowerStateControlProxy, RebootReason,
+    },
+    fuchsia_async as fasync,
+    fuchsia_syslog::fx_log_err,
+    fuchsia_zircon::Status,
+};
+
+/// Determines if we should reboot.
+pub(super) fn should_reboot(error: &MetadataError, config: &Config) -> bool {
+    if let MetadataError::Verify(VerifyError::BlobFs(_)) = error {
+        return config.blobfs() == &Mode::RebootOnFailure;
+    }
+    return true;
+}
+
+/// Waits for the timer to complete and then reboots the system, logging errors instead of failing.
+pub(super) async fn wait_and_reboot(proxy: &PowerStateControlProxy, timer: fasync::Timer) {
+    let () = timer.await;
+    if let Err(e) = async move {
+        proxy
+            .reboot(RebootReason::RetrySystemUpdate)
+            .await
+            .context("while performing reboot call")?
+            .map_err(Status::from_raw)
+            .context("reboot responded with")
+    }
+    .await
+    {
+        fx_log_err!("error initiating reboot: {:#}", anyhow!(e));
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use {
+        super::*,
+        crate::metadata::{BootManagerError, PolicyError, VerifyFailureReason},
+        futures::{channel::oneshot, pin_mut, task::Poll},
+        mock_reboot::MockRebootService,
+        parking_lot::Mutex,
+        proptest::prelude::*,
+        std::{sync::Arc, time::Duration},
+    };
+
+    #[test]
+    fn test_wait_and_reboot_success() {
+        let mut executor = fasync::Executor::new_with_fake_time().unwrap();
+
+        // Create a mock reboot service.
+        let (sender, recv) = oneshot::channel();
+        let sender = Arc::new(Mutex::new(Some(sender)));
+        let mock = Arc::new(MockRebootService::new(Box::new(move |reason: RebootReason| {
+            sender.lock().take().unwrap().send(reason).unwrap();
+            Ok(())
+        })));
+        let proxy = mock.spawn_reboot_service();
+
+        // Prepare futures to call reboot and receive the reboot request.
+        let timer_duration = 5;
+        let reboot_fut =
+            wait_and_reboot(&proxy, fasync::Timer::new(Duration::from_secs(timer_duration)));
+        pin_mut!(reboot_fut);
+        pin_mut!(recv);
+
+        // Set the time so that the timer is still going, so we should neither call reboot nor
+        // observe the reboot service was called.
+        executor.set_fake_time(fasync::Time::after(Duration::from_secs(timer_duration - 1).into()));
+        assert!(!executor.wake_expired_timers());
+        match executor.run_until_stalled(&mut reboot_fut) {
+            Poll::Ready(res) => panic!("future unexpectedly completed with response: {:?}", res),
+            Poll::Pending => {}
+        };
+        match executor.run_until_stalled(&mut recv) {
+            Poll::Ready(res) => panic!("future unexpectedly completed with response: {:?}", res),
+            Poll::Pending => {}
+        };
+
+        // Once the timer completes, we should complete the reboot call and observe we called the
+        // reboot service with the given reboot reason.
+        executor.set_fake_time(fasync::Time::after(Duration::from_secs(1).into()));
+        assert!(executor.wake_expired_timers());
+        match executor.run_until_stalled(&mut recv) {
+            Poll::Ready(res) => panic!("future unexpectedly completed with response: {:?}", res),
+            Poll::Pending => {}
+        };
+        match executor.run_until_stalled(&mut reboot_fut) {
+            Poll::Ready(_) => {}
+            Poll::Pending => panic!("future unexpectedly pending"),
+        };
+        match executor.run_until_stalled(&mut recv) {
+            Poll::Ready(res) => assert_eq!(res, Ok(RebootReason::RetrySystemUpdate)),
+            Poll::Pending => panic!("future unexpectedly pending"),
+        };
+    }
+
+    fn test_blobfs_verify_errors(config: Config, expect_reboot: bool) {
+        let timeout_err = MetadataError::Verify(VerifyError::BlobFs(VerifyFailureReason::Timeout));
+        let verify_err =
+            MetadataError::Verify(VerifyError::BlobFs(VerifyFailureReason::Verify(anyhow!("foo"))));
+        let fidl_err = MetadataError::Verify(VerifyError::BlobFs(VerifyFailureReason::Fidl(
+            fidl::Error::OutOfRange,
+        )));
+
+        assert_eq!(should_reboot(&timeout_err, &config), expect_reboot);
+        assert_eq!(should_reboot(&verify_err, &config), expect_reboot);
+        assert_eq!(should_reboot(&fidl_err, &config), expect_reboot);
+    }
+
+    /// Blobfs errors SHOULD NOT cause a reboot if they're ignored.
+    #[test]
+    fn test_blobfs_errors_should_not_reboot() {
+        test_blobfs_verify_errors(Config::builder().blobfs(Mode::Ignore).build(), false);
+    }
+
+    /// Blobfs errors SHOULD cause a reboot if they're NOT ignored.
+    #[test]
+    fn test_blobfs_errors_should_reboot() {
+        test_blobfs_verify_errors(Config::builder().blobfs(Mode::RebootOnFailure).build(), true);
+    }
+
+    // Test that all the non-blobfs metadata errors will always cause a reboot, regardless of the
+    // config. Ideally, we'd also generate arbitrary MetadataErrors, but that's not possible
+    // because there are !Clone descendants.
+    proptest! {
+        #[test]
+        fn test_should_reboot_commit_error(config: Config) {
+            let err = MetadataError::Commit(BootManagerError::Status {
+                method_name: "foo",
+                status: Status::UNAVAILABLE,
+            });
+            assert!(should_reboot(&err, &config));
+        }
+
+        #[test]
+        fn test_should_reboot_policy_error(config: Config) {
+            let err = MetadataError::Policy(PolicyError::Build(BootManagerError::Status {
+                method_name: "bar",
+                status: Status::ACCESS_DENIED,
+            }));
+            assert!(should_reboot(&err, &config));
+        }
+
+        #[test]
+        fn test_should_reboot_signal_error(config: Config) {
+            let err = MetadataError::SignalPeer(fuchsia_zircon::Status::NOT_FOUND);
+            assert!(should_reboot(&err, &config));
+        }
+
+        #[test]
+        fn test_should_reboot_unblock_error(config: Config) {
+            assert!(should_reboot(&MetadataError::Unblock, &config));
+        }
+    }
+}
diff --git a/src/sys/pkg/tests/system-update-committer/BUILD.gn b/src/sys/pkg/tests/system-update-committer/BUILD.gn
index 82b59a7..e7d45d2e 100644
--- a/src/sys/pkg/tests/system-update-committer/BUILD.gn
+++ b/src/sys/pkg/tests/system-update-committer/BUILD.gn
@@ -19,9 +19,11 @@
     "//src/lib/fuchsia-component",
     "//src/lib/zircon/rust:fuchsia-zircon",
     "//src/sys/pkg/testing/mock-paver",
+    "//src/sys/pkg/testing/mock-reboot",
     "//third_party/rust_crates:anyhow",
     "//third_party/rust_crates:futures",
     "//third_party/rust_crates:matches",
+    "//third_party/rust_crates:parking_lot",
   ]
 
   sources = [ "src/lib.rs" ]
diff --git a/src/sys/pkg/tests/system-update-committer/meta/system-update-committer.cmx b/src/sys/pkg/tests/system-update-committer/meta/system-update-committer.cmx
index b8cd5f5..65ccd44 100644
--- a/src/sys/pkg/tests/system-update-committer/meta/system-update-committer.cmx
+++ b/src/sys/pkg/tests/system-update-committer/meta/system-update-committer.cmx
@@ -7,6 +7,7 @@
     },
     "sandbox": {
         "services": [
+            "fuchsia.hardware.power.statecontrol.Admin",
             "fuchsia.paver.Paver"
         ]
     }
diff --git a/src/sys/pkg/tests/system-update-committer/src/lib.rs b/src/sys/pkg/tests/system-update-committer/src/lib.rs
index 83b97a5..c7bc886 100644
--- a/src/sys/pkg/tests/system-update-committer/src/lib.rs
+++ b/src/sys/pkg/tests/system-update-committer/src/lib.rs
@@ -12,9 +12,11 @@
         server::{NestedEnvironment, ServiceFs},
     },
     fuchsia_zircon::{self as zx, AsHandleRef},
-    futures::prelude::*,
+    futures::{channel::oneshot, prelude::*},
     matches::assert_matches,
     mock_paver::{hooks as mphooks, MockPaverService, MockPaverServiceBuilder, PaverEvent},
+    mock_reboot::{MockRebootService, RebootReason},
+    parking_lot::Mutex,
     std::{sync::Arc, time::Duration},
 };
 
@@ -24,12 +26,17 @@
 
 struct TestEnvBuilder {
     paver_service_builder: Option<MockPaverServiceBuilder>,
+    reboot_service: Option<MockRebootService>,
 }
 impl TestEnvBuilder {
     fn paver_service_builder(self, paver_service_builder: MockPaverServiceBuilder) -> Self {
         Self { paver_service_builder: Some(paver_service_builder), ..self }
     }
 
+    fn reboot_service(self, reboot_service: MockRebootService) -> Self {
+        Self { reboot_service: Some(reboot_service), ..self }
+    }
+
     fn build(self) -> TestEnv {
         let mut fs = ServiceFs::new();
         fs.add_proxy_service::<fidl_fuchsia_logger::LogSinkMarker, _>();
@@ -48,6 +55,20 @@
             .detach()
         });
 
+        // Set up reboot service.
+        let reboot_service = Arc::new(self.reboot_service.unwrap_or_else(|| {
+            MockRebootService::new(Box::new(|_| panic!("unexpected call to reboot")))
+        }));
+        let reboot_service_clone = Arc::clone(&reboot_service);
+        fs.add_fidl_service(move |stream| {
+            fasync::Task::spawn(
+                Arc::clone(&reboot_service_clone)
+                    .run_reboot_service(stream)
+                    .unwrap_or_else(|e| panic!("error running reboot service: {:#}", anyhow!(e))),
+            )
+            .detach()
+        });
+
         let env = fs
             .create_salted_nested_environment("system_update_committer_env")
             .expect("nested environment to create successfully");
@@ -57,18 +78,24 @@
             .spawn(env.launcher())
             .expect("system-update-committer to launch");
 
-        TestEnv { _env: env, system_update_committer, _paver_service: paver_service }
+        TestEnv {
+            _env: env,
+            system_update_committer,
+            _paver_service: paver_service,
+            _reboot_service: reboot_service,
+        }
     }
 }
 struct TestEnv {
     _env: NestedEnvironment,
     system_update_committer: App,
     _paver_service: Arc<MockPaverService>,
+    _reboot_service: Arc<MockRebootService>,
 }
 
 impl TestEnv {
     fn builder() -> TestEnvBuilder {
-        TestEnvBuilder { paver_service_builder: None }
+        TestEnvBuilder { paver_service_builder: None, reboot_service: None }
     }
 
     /// Opens a connection to the fuchsia.update/CommitStatusProvider FIDL service.
@@ -212,3 +239,29 @@
         Ok(zx::Signals::USER_0)
     );
 }
+
+/// When the paver fails, we expect the system-update-committer to trigger a reboot.
+#[fasync::run_singlethreaded(test)]
+async fn reboot() {
+    let (sender, recv) = oneshot::channel();
+    let sender = Arc::new(Mutex::new(Some(sender)));
+    let reboot_service = MockRebootService::new(Box::new(move |reason: RebootReason| {
+        sender.lock().take().unwrap().send(reason).unwrap();
+        Ok(())
+    }));
+
+    let _env = TestEnv::builder()
+        .paver_service_builder(MockPaverServiceBuilder::new().insert_hook(mphooks::return_error(
+            |e: &PaverEvent| {
+                if e == &PaverEvent::QueryCurrentConfiguration {
+                    zx::Status::NOT_FOUND
+                } else {
+                    zx::Status::OK
+                }
+            },
+        )))
+        .reboot_service(reboot_service)
+        .build();
+
+    assert_eq!(recv.await, Ok(RebootReason::RetrySystemUpdate));
+}