[cobalt][storage] Add metrics for bad blocks and nand ECC. Metrics to examine the failure behaviours of the nand reads with regard to ECC, to get an idea of how well/badly the fleet is faring. These will be populated by the Nand driver reporting the results of read attempts via Sampler. Metrics for the number of bad blocks in the NDM; this will be reported by the FTL/NDM process at start up and updated after each request along with the existing stats to keep it in sync with the current view. This will also be populated to Inspect and reported via Sampler. Bug: 85321 Change-Id: I38ba61892019d206385aa7006f3b4cfa692aaf49 Reviewed-on: https://fuchsia-review.googlesource.com/c/cobalt-registry/+/598026 Reviewed-by: Griffin Boyce <glamrock@google.com> Reviewed-by: Steve Fung <stevefung@google.com> Privacy-Approval: Griffin Boyce <glamrock@google.com> Commit-Queue: Martin Lindsay <mlindsay@google.com> (cherry picked from commit 5d0181eeb6f81802f39146f84958e6dec1b8c37e)

commit: 02d0e1509f8fbbe8141efda444023f0c916474ef [log] [tgz]
author: Martin Lindsay <mlindsay@google.com> Fri Feb 04 20:37:20 2022 +0000
committer: releases-try-builder <releases-try-builder@fuchsia-infra.iam.gserviceaccount.com> Sat Feb 05 00:16:27 2022 +0000
tree: c46553ed4954ae67c51671fa3c8e2c1fb0d23cd7
parent: 70564468182f69818aaa2db04745e7cfa7271d43 [diff]
diff --git a/fuchsia/local_storage/metrics.yaml b/fuchsia/local_storage/metrics.yaml
index 8a88ab5..ab4de27 100644
--- a/fuchsia/local_storage/metrics.yaml
+++ b/fuchsia/local_storage/metrics.yaml

@@ -1758,7 +1758,7 @@
 
 ################################################################################
 
-# FTL Metrics
+# FTL and Nand Metrics
 # Reserved Range: [2000, 3000)
 
 ################################################################################
@@ -2029,7 +2029,190 @@
     max_release_stage: GA
     expiration_date: "2022/02/16"
 
-# Next ID: 2006
+################################################################################
+# nand_read_ecc_bit_flips: bit flips repaired by ECC on each nand read.
+#
+# Each read contributes the number of bit flips that ECC corrected. A read
+# with more flips than ECC can correct is recorded as the maximum correctable
+# count plus one. For all current use cases the maximum correctable is 30.
+#
+# Shows how often bit flips occur and how severe they are across the fleet.
+################################################################################
+- id: 2006
+  metric_name: nand_read_ecc_bit_flips
+  metric_units_other: "bitflips"
+  metric_type: INTEGER_HISTOGRAM
+  metric_semantics: [USAGE_COUNTING]
+  int_buckets:
+    linear:
+      # One bucket per value: 0..30 corrected flips, 31 = uncorrectable read.
+      num_buckets: 32
+      step_size: 1
+  reports:
+    ############################################################################
+    # Fleet-wide distribution of per-read bit flip counts.
+    ############################################################################
+    - id: 1
+      report_name: nand_ecc_bit_flips_histogram
+      report_type: FLEETWIDE_HISTOGRAMS
+      system_profile_field: [BOARD_NAME, PRODUCT_NAME, SYSTEM_VERSION, CHANNEL]
+      privacy_level: NO_ADDED_PRIVACY
+  meta_data:
+    expiration_date: "2022/10/20"
+    max_release_stage: GA
+
+################################################################################
+# nand_read_attempts_per_read: attempts spent on one logical nand read.
+#
+# Records how many read attempts a single read consumed before it succeeded
+# or was abandoned. A read that succeeds first time reports 1. A read that is
+# abandoned reports MAX_ULONG, which lands in the overflow bucket.
+#
+# Shows how many consecutive attempts are actually productive, and how much
+# benefit the current maximum attempt count provides.
+################################################################################
+- id: 2007
+  metric_name: nand_read_attempts_per_read
+  metric_units_other: "attempts"
+  metric_type: INTEGER_HISTOGRAM
+  metric_semantics: [USAGE_COUNTING]
+  int_buckets:
+    exponential:
+      # 0, 1, 2, 4, 8, 16, 32, 64, 128 (doubling steps)
+      num_buckets: 9
+      floor: 0
+      initial_step: 1
+      step_multiplier: 2
+  reports:
+    ############################################################################
+    # Fleet-wide distribution of the attempts needed per read.
+    ############################################################################
+    - id: 1
+      report_name: attempts_histogram
+      report_type: FLEETWIDE_HISTOGRAMS
+      system_profile_field: [BOARD_NAME, PRODUCT_NAME, SYSTEM_VERSION, CHANNEL]
+      privacy_level: NO_ADDED_PRIVACY
+  meta_data:
+    expiration_date: "2022/10/20"
+    max_release_stage: GA
+
+################################################################################
+# nand_read_attempt_failures: read attempts to the nand that failed.
+#
+# Counts each read attempt that is sent to the nand and comes back as a
+# failure.
+#
+# Shows whether fleet-wide failures are clustered in a subset of devices, and
+# how badly affected those devices are. Daily values are normally expected to
+# be above 1,000 and below 100,000.
+################################################################################
+- id: 2008
+  metric_name: nand_read_attempt_failures
+  metric_semantics: [USAGE_COUNTING]
+  metric_type: OCCURRENCE
+  reports:
+    ############################################################################
+    # Per-device daily failure counts, reported as a histogram.
+    ############################################################################
+    - id: 1
+      report_name: failures_histogram
+      report_type: UNIQUE_DEVICE_HISTOGRAMS
+      privacy_level: NO_ADDED_PRIVACY
+      local_aggregation_period: WINDOW_1_DAY
+      system_profile_selection: SELECT_LAST
+      system_profile_field: [BOARD_NAME, PRODUCT_NAME, SYSTEM_VERSION, CHANNEL]
+      int_buckets:
+        exponential:
+          # 0, 1, 2, 4, 8... 32768, 65536 (doubling steps)
+          num_buckets: 18
+          floor: 0
+          initial_step: 1
+          step_multiplier: 2
+  meta_data:
+    expiration_date: "2022/10/20"
+    max_release_stage: GA
+
+################################################################################
+# nand_read_attempts_exhausted: reads that failed after using every attempt.
+#
+# Counts reads for which none of the attempts sent to the nand succeeded.
+#
+# Shows whether fleet-wide failures are clustered in a subset of devices, and
+# how badly affected those devices are. Daily values are normally expected to
+# be above 1 and below 100; they could in theory be higher if such failures
+# did not usually leave the system inoperable.
+################################################################################
+- id: 2009
+  metric_name: nand_read_attempts_exhausted
+  metric_semantics: [USAGE_COUNTING]
+  metric_type: OCCURRENCE
+  reports:
+    ############################################################################
+    # Per-device daily counts of reads that exhausted every attempt.
+    ############################################################################
+    - id: 1
+      report_name: exhausted_histogram
+      report_type: UNIQUE_DEVICE_HISTOGRAMS
+      privacy_level: NO_ADDED_PRIVACY
+      local_aggregation_period: WINDOW_1_DAY
+      system_profile_selection: SELECT_LAST
+      system_profile_field: [BOARD_NAME, PRODUCT_NAME, SYSTEM_VERSION, CHANNEL]
+      int_buckets:
+        exponential:
+          # 0, 1, 2, 4, 8, 16, 32, 64, 128 (doubling steps)
+          num_buckets: 9
+          floor: 0
+          initial_step: 1
+          step_multiplier: 2
+  meta_data:
+    expiration_date: "2022/10/20"
+    max_release_stage: GA
+
+################################################################################
+# Number of bad blocks found on a device
+#
+# A count of how many blocks are found bad as marked by the initial factory
+# mark, and how many are found and marked as bad during run time.
+#
+# For insight into the fleetwide state regarding the availability of the spare
+# block pools, determined by how many blocks are currently marked bad. Also the
+# split of initial bad blocks vs running bad blocks. For current use-cases 44
+# is really the maximum.
+################################################################################
+- id: 2010
+  metric_name: ftl_bad_blocks
+  metric_type: INTEGER
+  metric_semantics: [USAGE_COUNTING]
+  metric_units_other: "blocks"
+  metric_dimensions:
+    - dimension: bad_block_type
+      event_codes:
+        0: Unknown
+        1: INITIAL  # bad per the initial factory mark
+        2: RUNNING  # found and marked bad during run time
+  reports:
+    ############################################################################
+    # A daily histogram of bad block information per device.
+    ############################################################################
+    - report_name: bad_blocks_histogram
+      id: 1
+      report_type: UNIQUE_DEVICE_HISTOGRAMS
+      local_aggregation_period: WINDOW_1_DAY
+      local_aggregation_procedure: MAX_PROCEDURE
+      int_buckets:
+        exponential:
+          # 0, 1, 2, 4, 8... 32, 64
+          floor: 0
+          num_buckets: 8
+          initial_step: 1
+          step_multiplier: 2
+      privacy_level: NO_ADDED_PRIVACY
+      system_profile_selection: SELECT_LAST
+      system_profile_field: [BOARD_NAME, PRODUCT_NAME, SYSTEM_VERSION, CHANNEL]
+  meta_data:
+    max_release_stage: GA
+    expiration_date: "2022/10/20"
+# Next ID: 2011
 
 ################################################################################
commit	02d0e1509f8fbbe8141efda444023f0c916474ef	[log] [tgz]
author	Martin Lindsay <mlindsay@google.com>	Fri Feb 04 20:37:20 2022 +0000
committer	releases-try-builder <releases-try-builder@fuchsia-infra.iam.gserviceaccount.com>	Sat Feb 05 00:16:27 2022 +0000
tree	c46553ed4954ae67c51671fa3c8e2c1fb0d23cd7
parent	70564468182f69818aaa2db04745e7cfa7271d43 [diff]