// Copyright 2020 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
use fuchsia_inspect::{
ExponentialHistogramParams, HistogramProperty, LinearHistogramParams, Node, NumericProperty,
UintExponentialHistogramProperty, UintLinearHistogramProperty, UintProperty,
};
use fuchsia_sync::Mutex;
use fuchsia_zircon::{self as zx, Duration};
use lazy_static::lazy_static;
use std::{
collections::BTreeMap,
sync::{
atomic::{AtomicUsize, Ordering},
Arc, OnceLock,
},
};
lazy_static! {
    // Exponential histogram for time in microseconds; buckets are power-of-two intervals.
static ref TIME_USEC_PARAMS : ExponentialHistogramParams<u64> = ExponentialHistogramParams {
floor: 0,
initial_step: 1,
step_multiplier: 2,
buckets: 26,
};
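    // TIME_USEC_PARAMS above yields buckets [0,1), [1,2), [2,4), ...,
    // [2^24, 2^25) microseconds (roughly 33.5 seconds at the top), plus the
    // implicit underflow and overflow buckets.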
    // Linear histogram for the max snapshot size in bytes requested by clients.
    // Divides configs into 10KB buckets, from 0MB to 1MB.
static ref MAX_SNAPSHOT_SIZE_BYTES_PARAMS : LinearHistogramParams<u64> = LinearHistogramParams {
floor: 0,
step_size: 10000,
buckets: 100,
};
    // Linear histogram tracking the percentage of schemas truncated for a given snapshot.
    // Divides percentages into 5% buckets, from 0% to 100%.
static ref SNAPSHOT_SCHEMA_TRUNCATION_PARAMS : LinearHistogramParams<u64> = LinearHistogramParams {
floor: 0,
step_size: 5,
buckets: 20,
};
}
pub struct AccessorStats {
/// Inspect node for tracking usage/health metrics of diagnostics platform.
_node: Node,
/// Metrics aggregated across all client connections.
pub global_stats: Arc<GlobalAccessorStats>,
    /// Global stats tracking usage of StreamDiagnostics for
    /// exfiltrating inspect data.
pub inspect_stats: Arc<GlobalConnectionStats>,
    /// Global stats tracking usage of StreamDiagnostics for
    /// exfiltrating logs.
pub logs_stats: Arc<GlobalConnectionStats>,
}
pub struct GlobalAccessorStats {
    /// Property tracking the number of connections opened to any archive_accessor instance.
pub connections_opened: UintProperty,
    /// Property tracking the number of connections closed to any archive_accessor instance.
pub connections_closed: UintProperty,
    /// Number of StreamDiagnostics requests made to a single ArchiveAccessor, each of
    /// which starts a new inspect ReaderServer.
pub stream_diagnostics_requests: UintProperty,
}
impl AccessorStats {
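    /// Creates stats rooted at `node`; the node is retained (as `_node`) so the
    /// properties created under it stay alive for the lifetime of this struct.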
pub fn new(node: Node) -> Self {
let connections_opened = node.create_uint("connections_opened", 0);
let connections_closed = node.create_uint("connections_closed", 0);
let stream_diagnostics_requests = node.create_uint("stream_diagnostics_requests", 0);
let inspect_stats = Arc::new(GlobalConnectionStats::new(node.create_child("inspect")));
let logs_stats = Arc::new(GlobalConnectionStats::new(node.create_child("logs")));
AccessorStats {
_node: node,
global_stats: Arc::new(GlobalAccessorStats {
connections_opened,
connections_closed,
stream_diagnostics_requests,
}),
inspect_stats,
logs_stats,
}
}
pub fn new_inspect_batch_iterator(&self) -> BatchIteratorConnectionStats {
self.inspect_stats.new_batch_iterator_connection()
}
pub fn new_logs_batch_iterator(&self) -> BatchIteratorConnectionStats {
self.logs_stats.new_batch_iterator_connection()
}
}
pub struct GlobalConnectionStats {
    /// Node that stores the stats, used for on-demand population of lazily created properties.
node: Node,
    /// Number of DiagnosticsServers created in response to a StreamDiagnostics
    /// client request.
reader_servers_constructed: UintProperty,
    /// Number of DiagnosticsServers destroyed when they fall out of scope.
reader_servers_destroyed: UintProperty,
/// Stats about BatchIterator connections.
batch_iterator: GlobalBatchIteratorStats,
/// Property tracking number of times a future to retrieve diagnostics data for a component
/// timed out.
component_timeouts_count: UintProperty,
/// Number of times a diagnostics schema had to be truncated because it would otherwise
/// cause a component to exceed its configured size budget.
schema_truncation_count: UintProperty,
    /// Lazily initialized histogram of processing times for individual components in GetNext.
component_time_usec: OnceLock<UintExponentialHistogramProperty>,
/// Histogram of max aggregated snapshot sizes for overall Snapshot requests.
max_snapshot_sizes_bytes: UintLinearHistogramProperty,
/// Percentage of schemas in a single snapshot that got truncated.
snapshot_schema_truncation_percentage: UintLinearHistogramProperty,
/// Longest processing times for individual components, with timestamps.
processing_time_tracker: OnceLock<Mutex<ProcessingTimeTracker>>,
/// Node under which the batch iterator connections stats are created.
batch_iterator_connections: Node,
/// The id of the next BatchIterator connection.
next_connection_id: AtomicUsize,
}
impl GlobalConnectionStats {
pub fn new(node: Node) -> Self {
let reader_servers_constructed = node.create_uint("reader_servers_constructed", 0);
let reader_servers_destroyed = node.create_uint("reader_servers_destroyed", 0);
let batch_iterator = GlobalBatchIteratorStats::new(&node);
let component_timeouts_count = node.create_uint("component_timeouts_count", 0);
let max_snapshot_sizes_bytes = node.create_uint_linear_histogram(
"max_snapshot_sizes_bytes",
MAX_SNAPSHOT_SIZE_BYTES_PARAMS.clone(),
);
let snapshot_schema_truncation_percentage = node.create_uint_linear_histogram(
"snapshot_schema_truncation_percentage",
SNAPSHOT_SCHEMA_TRUNCATION_PARAMS.clone(),
);
let schema_truncation_count = node.create_uint("schema_truncation_count", 0);
let batch_iterator_connections = node.create_child("batch_iterator_connections");
GlobalConnectionStats {
node,
reader_servers_constructed,
reader_servers_destroyed,
batch_iterator,
batch_iterator_connections,
component_timeouts_count,
max_snapshot_sizes_bytes,
snapshot_schema_truncation_percentage,
schema_truncation_count,
component_time_usec: OnceLock::new(),
processing_time_tracker: OnceLock::new(),
next_connection_id: AtomicUsize::new(0),
}
}
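    /// Creates stats for a new BatchIterator connection under
    /// `batch_iterator_connections`, naming the child node with the next
    /// connection id.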
fn new_batch_iterator_connection(self: &Arc<Self>) -> BatchIteratorConnectionStats {
let node = self
.batch_iterator_connections
.create_child(self.next_connection_id.fetch_add(1, Ordering::Relaxed).to_string());
BatchIteratorConnectionStats::new(node, Arc::clone(self))
}
pub fn add_timeout(&self) {
self.component_timeouts_count.add(1);
}
pub fn record_percent_truncated_schemas(&self, percent_truncated_schemas: u64) {
self.snapshot_schema_truncation_percentage.insert(percent_truncated_schemas);
}
pub fn record_max_snapshot_size_config(&self, max_snapshot_size_config: u64) {
self.max_snapshot_sizes_bytes.insert(max_snapshot_size_config);
}
/// Record the duration of a whole request to GetNext.
pub fn record_batch_duration(&self, duration: Duration) {
let micros = duration.into_micros();
if micros >= 0 {
self.batch_iterator.get_next.time_usec.insert(micros as u64);
}
}
/// Record the duration of obtaining data from a single component.
pub fn record_component_duration(&self, moniker: impl AsRef<str>, duration: Duration) {
let nanos = duration.into_nanos();
if nanos >= 0 {
// Lazily initialize stats that may not be needed for all diagnostics types.
let component_time_usec = self.component_time_usec.get_or_init(|| {
self.node.create_uint_exponential_histogram(
"component_time_usec",
TIME_USEC_PARAMS.clone(),
)
});
let processing_time_tracker = self.processing_time_tracker.get_or_init(|| {
Mutex::new(ProcessingTimeTracker::new(
self.node.create_child("longest_processing_times"),
))
});
component_time_usec.insert(nanos as u64 / 1000);
processing_time_tracker.lock().track(moniker.as_ref(), nanos as u64);
}
}
}
struct GlobalBatchIteratorStats {
_node: Node,
    /// Property tracking the number of connections opened to any batch iterator instance.
connections_opened: UintProperty,
    /// Property tracking the number of connections closed to any batch iterator instance.
connections_closed: UintProperty,
get_next: GlobalBatchIteratorGetNextStats,
}
impl GlobalBatchIteratorStats {
fn new(parent: &Node) -> Self {
let node = parent.create_child("batch_iterator");
let connections_opened = node.create_uint("connections_opened", 0);
let connections_closed = node.create_uint("connections_closed", 0);
let get_next = GlobalBatchIteratorGetNextStats::new(&node);
Self { _node: node, connections_opened, connections_closed, get_next }
}
}
struct GlobalBatchIteratorGetNextStats {
_node: Node,
    /// Number of times "GetNext" was called.
    requests: UintProperty,
    /// Number of times a "GetNext" response was sent.
    responses: UintProperty,
    /// Number of items returned in batches from "GetNext".
    result_count: UintProperty,
    /// Number of items returned in batches from "GetNext" that contained errors.
    result_errors: UintProperty,
/// Histogram of processing times for overall "GetNext" requests.
time_usec: UintExponentialHistogramProperty,
}
impl GlobalBatchIteratorGetNextStats {
fn new(parent: &Node) -> Self {
let node = parent.create_child("get_next");
let requests = node.create_uint("requests", 0);
let responses = node.create_uint("responses", 0);
let result_count = node.create_uint("result_count", 0);
let result_errors = node.create_uint("result_errors", 0);
let time_usec =
node.create_uint_exponential_histogram("time_usec", TIME_USEC_PARAMS.clone());
Self { _node: node, requests, responses, result_count, result_errors, time_usec }
}
}
const PROCESSING_TIME_COMPONENT_COUNT_LIMIT: usize = 20;
/// Holds stats on the longest processing times for individual components' data.
struct ProcessingTimeTracker {
/// The node holding all properties for the tracker.
node: Node,
/// Map from component moniker to a tuple of its time and a node containing the stats about it.
longest_times_by_component: BTreeMap<String, (u64, Node)>,
/// The shortest time seen so far. If a new component is being
/// recorded and its time is greater than this, we need to pop the
/// entry containing this time.
shortest_time_ns: u64,
}
impl ProcessingTimeTracker {
fn new(node: Node) -> Self {
Self { node, longest_times_by_component: BTreeMap::new(), shortest_time_ns: u64::MAX }
}
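    /// Records a processing time of `time_ns` nanoseconds for `moniker`,
    /// retaining only the `PROCESSING_TIME_COMPONENT_COUNT_LIMIT` longest
    /// times seen so far (one entry per moniker).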
fn track(&mut self, moniker: &str, time_ns: u64) {
let at_capacity =
self.longest_times_by_component.len() >= PROCESSING_TIME_COMPONENT_COUNT_LIMIT;
        // Do nothing if the container is at the limit and the new time doesn't need to be
        // inserted.
if at_capacity && time_ns < self.shortest_time_ns {
return;
}
let parent_node = &self.node;
let make_entry = || {
let n = parent_node.create_child(moniker.to_string());
n.record_int("@time", zx::Time::get_monotonic().into_nanos());
n.record_double("duration_seconds", time_ns as f64 / 1e9);
(time_ns, n)
};
self.longest_times_by_component
.entry(moniker.to_string())
.and_modify(move |v| {
if v.0 < time_ns {
*v = make_entry();
}
})
.or_insert_with(make_entry);
        // Repeatedly find and remove the entry with the smallest time until we are back
        // within the limit.
while self.longest_times_by_component.len() > PROCESSING_TIME_COMPONENT_COUNT_LIMIT {
let mut key = "".to_string();
for (k, (val, _)) in &self.longest_times_by_component {
if *val == self.shortest_time_ns {
key.clone_from(k);
break;
}
}
self.longest_times_by_component.remove(&key);
self.shortest_time_ns =
self.longest_times_by_component.values().map(|v| v.0).min().unwrap_or(u64::MAX);
}
self.shortest_time_ns = std::cmp::min(self.shortest_time_ns, time_ns);
}
}
pub struct BatchIteratorConnectionStats {
/// Inspect node for tracking usage/health metrics of a single connection to a batch iterator.
_node: Node,
/// Global stats for connections to the BatchIterator protocol.
global_stats: Arc<GlobalConnectionStats>,
/// Property tracking number of requests to the BatchIterator instance this struct is tracking.
get_next_requests: UintProperty,
/// Property tracking number of responses from the BatchIterator instance this struct is tracking.
get_next_responses: UintProperty,
/// Property tracking number of times the batch iterator has served a terminal batch signalling that
/// the client has reached the end of the iterator and should terminate their connection.
get_next_terminal_responses: UintProperty,
}
impl BatchIteratorConnectionStats {
fn new(node: Node, global_stats: Arc<GlobalConnectionStats>) -> Self {
        // `reader_servers_constructed` is incremented here; the Drop impl increments
        // `reader_servers_destroyed`.
global_stats.reader_servers_constructed.add(1);
let get_next = node.create_child("get_next");
let get_next_requests = get_next.create_uint("requests", 0);
let get_next_responses = get_next.create_uint("responses", 0);
let get_next_terminal_responses = get_next.create_uint("terminal_responses", 0);
node.record(get_next);
Self {
_node: node,
global_stats,
get_next_requests,
get_next_responses,
get_next_terminal_responses,
}
}
pub fn open_connection(&self) {
self.global_stats.batch_iterator.connections_opened.add(1);
}
pub fn close_connection(&self) {
self.global_stats.batch_iterator.connections_closed.add(1);
}
pub fn global_stats(&self) -> &Arc<GlobalConnectionStats> {
&self.global_stats
}
pub fn add_request(&self) {
self.global_stats.batch_iterator.get_next.requests.add(1);
self.get_next_requests.add(1);
}
pub fn add_response(&self) {
self.global_stats.batch_iterator.get_next.responses.add(1);
self.get_next_responses.add(1);
}
pub fn add_terminal(&self) {
self.get_next_terminal_responses.add(1);
}
pub fn add_result(&self) {
self.global_stats.batch_iterator.get_next.result_count.add(1);
}
pub fn add_result_error(&self) {
self.global_stats.batch_iterator.get_next.result_errors.add(1);
}
pub fn add_schema_truncated(&self) {
self.global_stats.schema_truncation_count.add(1);
}
}
impl Drop for BatchIteratorConnectionStats {
fn drop(&mut self) {
self.global_stats.reader_servers_destroyed.add(1);
}
}
#[cfg(test)]
mod test {
use super::*;
use diagnostics_assertions::{assert_data_tree, AnyProperty};
use fuchsia_inspect::{component, health::Reporter, Inspector};
#[fuchsia::test]
fn health() {
component::health().set_ok();
assert_data_tree!(component::inspector(),
root: {
"fuchsia.inspect.Health": {
status: "OK",
start_timestamp_nanos: AnyProperty,
}
});
component::health().set_unhealthy("Bad state");
assert_data_tree!(component::inspector(),
root: contains {
"fuchsia.inspect.Health": {
status: "UNHEALTHY",
message: "Bad state",
start_timestamp_nanos: AnyProperty,
}
});
component::health().set_ok();
assert_data_tree!(component::inspector(),
root: contains {
"fuchsia.inspect.Health": {
status: "OK",
start_timestamp_nanos: AnyProperty,
}
});
}
#[fuchsia::test]
fn processing_time_tracker() {
let inspector = Inspector::default();
let mut tracker = ProcessingTimeTracker::new(inspector.root().create_child("test"));
tracker.track("a", 1e9 as u64);
assert_data_tree!(inspector,
root: {
test: {
a: {
"@time": AnyProperty,
duration_seconds: 1f64
}
}
});
tracker.track("a", 5e8 as u64);
assert_data_tree!(inspector,
root: {
test: {
a: {
"@time": AnyProperty,
duration_seconds: 1f64
}
}
});
tracker.track("a", 5500e6 as u64);
assert_data_tree!(inspector,
root: {
test: {
a: {
"@time": AnyProperty,
duration_seconds: 5.5f64
}
}
});
for time in 0..60 {
tracker.track(&format!("b{time}"), time * 1e9 as u64);
}
assert_data_tree!(inspector,
root: {
test: {
b40: { "@time": AnyProperty, duration_seconds: 40f64 },
b41: { "@time": AnyProperty, duration_seconds: 41f64 },
b42: { "@time": AnyProperty, duration_seconds: 42f64 },
b43: { "@time": AnyProperty, duration_seconds: 43f64 },
b44: { "@time": AnyProperty, duration_seconds: 44f64 },
b45: { "@time": AnyProperty, duration_seconds: 45f64 },
b46: { "@time": AnyProperty, duration_seconds: 46f64 },
b47: { "@time": AnyProperty, duration_seconds: 47f64 },
b48: { "@time": AnyProperty, duration_seconds: 48f64 },
b49: { "@time": AnyProperty, duration_seconds: 49f64 },
b50: { "@time": AnyProperty, duration_seconds: 50f64 },
b51: { "@time": AnyProperty, duration_seconds: 51f64 },
b52: { "@time": AnyProperty, duration_seconds: 52f64 },
b53: { "@time": AnyProperty, duration_seconds: 53f64 },
b54: { "@time": AnyProperty, duration_seconds: 54f64 },
b55: { "@time": AnyProperty, duration_seconds: 55f64 },
b56: { "@time": AnyProperty, duration_seconds: 56f64 },
b57: { "@time": AnyProperty, duration_seconds: 57f64 },
b58: { "@time": AnyProperty, duration_seconds: 58f64 },
b59: { "@time": AnyProperty, duration_seconds: 59f64 },
}
});
}
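    // A minimal sketch (not part of the original suite) exercising the public
    // `AccessorStats` counters end to end; it relies only on APIs already used
    // elsewhere in this file.
    #[fuchsia::test]
    fn accessor_stats_counters() {
        let inspector = Inspector::default();
        let stats = AccessorStats::new(inspector.root().create_child("archive_accessor"));
        stats.global_stats.connections_opened.add(1);
        // Creating a batch iterator connection bumps reader_servers_constructed.
        let batch = stats.new_inspect_batch_iterator();
        batch.add_request();
        batch.add_response();
        assert_data_tree!(inspector,
        root: contains {
            archive_accessor: contains {
                connections_opened: 1u64,
                inspect: contains {
                    reader_servers_constructed: 1u64,
                    reader_servers_destroyed: 0u64,
                },
            }
        });
        // Dropping the connection stats increments the destroyed counter.
        drop(batch);
        assert_data_tree!(inspector,
        root: contains {
            archive_accessor: contains {
                inspect: contains {
                    reader_servers_destroyed: 1u64,
                },
            }
        });
    }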
}