blob: 720da372568933b966e81b16e7da7b99ccd0d13d [file] [log] [blame]
// Copyright 2022 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//! The Transmission Control Protocol (TCP).
pub mod buffer;
mod congestion;
mod rtt;
pub mod segment;
mod seqnum;
pub mod socket;
pub mod state;
use core::{
num::{NonZeroU16, NonZeroU8},
time::Duration,
};
use const_unwrap::const_unwrap_option;
use net_types::ip::{GenericOverIp, Ip, IpMarked, IpVersion};
use packet_formats::{
icmp::{Icmpv4DestUnreachableCode, Icmpv6DestUnreachableCode},
utils::NonZeroDuration,
};
use rand::RngCore;
use crate::{
counters::Counter,
device,
ip::{
icmp::{IcmpErrorCode, Icmpv4ErrorCode, Icmpv6ErrorCode},
socket::Mms,
IpExt,
},
transport::tcp::{
seqnum::{UnscaledWindowSize, WindowSize},
socket::{isn::IsnGenerator, DualStackIpExt, Sockets},
state::DEFAULT_MAX_SYN_RETRIES,
},
};
use self::socket::TcpBindingsTypes;
/// Default lifetime for an orphaned connection in FIN_WAIT2.
///
/// This is the value [`SocketOptions::default`] uses for
/// [`SocketOptions::fin_wait2_timeout`].
pub const DEFAULT_FIN_WAIT2_TIMEOUT: Duration = Duration::from_secs(60);
/// The control bits of a TCP segment that can alter the state of a TCP
/// control block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Control {
    /// The SYN bit of a TCP segment.
    SYN,
    /// The FIN bit of a TCP segment.
    FIN,
    /// The RST bit of a TCP segment.
    RST,
}

impl Control {
    /// Reports whether this flag takes up one byte of the sequence number
    /// space: `true` for SYN and FIN, `false` for RST.
    fn has_sequence_no(self) -> bool {
        // Kept as an exhaustive match so that adding a variant forces a
        // decision here.
        match self {
            Self::RST => false,
            Self::SYN | Self::FIN => true,
        }
    }
}
/// Errors surfaced to the user.
///
/// Apart from [`ConnectionError::ConnectionReset`] and
/// [`ConnectionError::TimedOut`], every variant can be produced by the
/// `From<IcmpErrorCode>` conversion into `Option<ConnectionError>` defined in
/// this module.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ConnectionError {
/// The connection was reset because of a RST segment.
ConnectionReset,
/// The connection was closed because the network is unreachable.
NetworkUnreachable,
/// The connection was closed because the host is unreachable.
HostUnreachable,
/// The connection was closed because the protocol is unreachable.
ProtocolUnreachable,
/// The connection was closed because the port is unreachable.
PortUnreachable,
/// The connection was closed because the host is down.
DestinationHostDown,
/// The connection was closed because the source route failed.
SourceRouteFailed,
/// The connection was closed because the source host is isolated.
SourceHostIsolated,
/// The connection was closed because of a timeout.
TimedOut,
}
impl From<IcmpErrorCode> for Option<ConnectionError> {
// Notes: the following mappings are guided by the packetimpact test here:
// https://cs.opensource.google/gvisor/gvisor/+/master:test/packetimpact/tests/tcp_network_unreachable_test.go;drc=611e6e1247a0691f5fd198f411c68b3bc79d90af
fn from(err: IcmpErrorCode) -> Self {
match err {
IcmpErrorCode::V4(Icmpv4ErrorCode::DestUnreachable(code)) => match code {
Icmpv4DestUnreachableCode::DestNetworkUnreachable => {
Some(ConnectionError::NetworkUnreachable)
}
Icmpv4DestUnreachableCode::DestHostUnreachable => {
Some(ConnectionError::HostUnreachable)
}
Icmpv4DestUnreachableCode::DestProtocolUnreachable => {
Some(ConnectionError::ProtocolUnreachable)
}
Icmpv4DestUnreachableCode::DestPortUnreachable => {
Some(ConnectionError::PortUnreachable)
}
Icmpv4DestUnreachableCode::FragmentationRequired => None,
Icmpv4DestUnreachableCode::SourceRouteFailed => {
Some(ConnectionError::SourceRouteFailed)
}
Icmpv4DestUnreachableCode::DestNetworkUnknown => {
Some(ConnectionError::NetworkUnreachable)
}
Icmpv4DestUnreachableCode::DestHostUnknown => {
Some(ConnectionError::DestinationHostDown)
}
Icmpv4DestUnreachableCode::SourceHostIsolated => {
Some(ConnectionError::SourceHostIsolated)
}
Icmpv4DestUnreachableCode::NetworkAdministrativelyProhibited => {
Some(ConnectionError::NetworkUnreachable)
}
Icmpv4DestUnreachableCode::HostAdministrativelyProhibited => {
Some(ConnectionError::HostUnreachable)
}
Icmpv4DestUnreachableCode::NetworkUnreachableForToS => {
Some(ConnectionError::NetworkUnreachable)
}
Icmpv4DestUnreachableCode::HostUnreachableForToS => {
Some(ConnectionError::HostUnreachable)
}
Icmpv4DestUnreachableCode::CommAdministrativelyProhibited => {
Some(ConnectionError::HostUnreachable)
}
Icmpv4DestUnreachableCode::HostPrecedenceViolation => {
Some(ConnectionError::HostUnreachable)
}
Icmpv4DestUnreachableCode::PrecedenceCutoffInEffect => {
Some(ConnectionError::HostUnreachable)
}
},
// TODO(https://fxbug.dev/42052672): Map the following ICMP messages.
IcmpErrorCode::V4(
Icmpv4ErrorCode::ParameterProblem(_)
| Icmpv4ErrorCode::Redirect(_)
| Icmpv4ErrorCode::TimeExceeded(_),
) => None,
IcmpErrorCode::V6(Icmpv6ErrorCode::DestUnreachable(code)) => match code {
Icmpv6DestUnreachableCode::NoRoute => Some(ConnectionError::NetworkUnreachable),
Icmpv6DestUnreachableCode::CommAdministrativelyProhibited => {
Some(ConnectionError::HostUnreachable)
}
Icmpv6DestUnreachableCode::BeyondScope => Some(ConnectionError::NetworkUnreachable),
Icmpv6DestUnreachableCode::AddrUnreachable => {
Some(ConnectionError::HostUnreachable)
}
Icmpv6DestUnreachableCode::PortUnreachable => {
Some(ConnectionError::PortUnreachable)
}
Icmpv6DestUnreachableCode::SrcAddrFailedPolicy => {
Some(ConnectionError::SourceRouteFailed)
}
Icmpv6DestUnreachableCode::RejectRoute => Some(ConnectionError::NetworkUnreachable),
},
// TODO(https://fxbug.dev/42052672): Map the following ICMP messages.
IcmpErrorCode::V6(
Icmpv6ErrorCode::PacketTooBig
| Icmpv6ErrorCode::ParameterProblem(_)
| Icmpv6ErrorCode::TimeExceeded(_),
) => None,
}
}
}
/// The TCP state kept for IP version `I`: an ISN generator, the sockets, and
/// the counters.
#[derive(GenericOverIp)]
#[generic_over_ip(I, Ip)]
pub(crate) struct TcpState<I: DualStackIpExt, D: device::WeakId, BT: TcpBindingsTypes> {
/// The [`IsnGenerator`] built in [`TcpState::new`] from the creation instant
/// and an RNG.
// NOTE(review): presumably generates initial sequence numbers, judging by
// the type name - confirm against `socket::isn`.
pub(crate) isn_generator: IsnGenerator<BT::Instant>,
/// The socket state, created with `Sockets::new()` in [`TcpState::new`].
pub(crate) sockets: Sockets<I, D, BT>,
/// The TCP counters, see [`TcpCounters`].
pub(crate) counters: TcpCounters<I>,
}
impl<I: DualStackIpExt, D: device::WeakId, BT: TcpBindingsTypes> TcpState<I, D, BT> {
    /// Builds a new `TcpState`.
    ///
    /// `now` and `rng` are handed to `IsnGenerator::new`; the sockets start
    /// out as `Sockets::new()` and the counters at their default value.
    pub(crate) fn new(now: BT::Instant, rng: &mut impl RngCore) -> Self {
        let isn_generator = IsnGenerator::new(now, rng);
        let sockets = Sockets::new();
        let counters = Default::default();
        Self { isn_generator, sockets, counters }
    }
}
/// `packet_formats::tcp::HDR_PREFIX_LEN` as a `u32`.
///
/// Subtracted from the IP layer's maximum message size when deriving an
/// [`Mss`] in [`Mss::from_mms`].
const TCP_HEADER_LEN: u32 = packet_formats::tcp::HDR_PREFIX_LEN as u32;
/// Maximum segment size: the largest TCP payload that a single segment can
/// carry.
#[derive(Clone, Copy, PartialEq, Eq, Debug, PartialOrd, Ord)]
pub(crate) struct Mss(NonZeroU16);

impl Mss {
    /// Derives the MSS from the maximum message size of the IP layer.
    ///
    /// The result is the MMS minus `TCP_HEADER_LEN`, clamped to `u16::MAX`.
    /// Returns `None` when nothing is left after the subtraction.
    fn from_mms<I: IpExt>(mms: Mms) -> Option<Self> {
        let payload = mms.get().get().saturating_sub(TCP_HEADER_LEN);
        let clamped = match u16::try_from(payload) {
            Ok(fits) => fits,
            Err(_) => u16::MAX,
        };
        NonZeroU16::new(clamped).map(Mss)
    }

    /// The MSS to assume for IP version `I` when the peer did not send one.
    const fn default<I: Ip>() -> Self {
        // Per RFC 9293 Section 3.7.1:
        //  If an MSS Option is not received at connection setup, TCP
        //  implementations MUST assume a default send MSS of 536 (576 - 40) for
        //  IPv4 or 1220 (1280 - 60) for IPv6 (MUST-15).
        let default_mss = match I::VERSION {
            IpVersion::V4 => 536,
            IpVersion::V6 => 1220,
        };
        Mss(const_unwrap_option(NonZeroU16::new(default_mss)))
    }

    /// Returns the MSS as a number.
    const fn get(&self) -> NonZeroU16 {
        let Mss(value) = self;
        *value
    }
}
impl From<Mss> for u32 {
fn from(Mss(mss): Mss) -> Self {
u32::from(mss.get())
}
}
/// Named tuple for holding sizes of buffers for a socket.
///
/// `Eq` and `PartialEq` are only derived in test builds.
#[derive(Copy, Clone, Debug)]
#[cfg_attr(test, derive(Eq, PartialEq))]
pub struct BufferSizes {
/// The size of the send buffer.
pub send: usize,
/// The size of the receive buffer.
///
/// The receive windows returned by `rwnd` and `rwnd_unscaled` are derived
/// from this value.
pub receive: usize,
}
/// Sensible defaults, only available to tests and to the `testutils` feature.
#[cfg(any(test, feature = "testutils"))]
impl Default for BufferSizes {
    /// Both buffers default to `WindowSize::DEFAULT`.
    fn default() -> Self {
        let send: usize = seqnum::WindowSize::DEFAULT.into();
        let receive: usize = seqnum::WindowSize::DEFAULT.into();
        Self { send, receive }
    }
}
/// Counterpart of [`BufferSizes`] in which either size may be absent.
#[derive(Debug)]
pub(crate) struct OptionalBufferSizes {
/// The size of the send buffer, if there is one.
pub(crate) send: Option<usize>,
/// The size of the receive buffer, if there is one.
pub(crate) receive: Option<usize>,
}
impl BufferSizes {
    /// Copies both sizes into an [`OptionalBufferSizes`], each wrapped in
    /// `Some`.
    fn into_optional(&self) -> OptionalBufferSizes {
        OptionalBufferSizes { send: Some(self.send), receive: Some(self.receive) }
    }

    /// The receive window for the receive buffer size; falls back to
    /// `WindowSize::MAX` when `WindowSize::new` rejects the size.
    fn rwnd(&self) -> WindowSize {
        match WindowSize::new(self.receive) {
            Some(window) => window,
            None => WindowSize::MAX,
        }
    }

    /// The unscaled receive window for the receive buffer size; sizes that
    /// do not fit in a `u16` are clamped to `u16::MAX`.
    fn rwnd_unscaled(&self) -> UnscaledWindowSize {
        let clamped = match u16::try_from(self.receive) {
            Ok(fits) => fits,
            Err(_) => u16::MAX,
        };
        UnscaledWindowSize::from(clamped)
    }
}
/// TCP socket options.
///
/// This only stores options that are trivial to get and set.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct SocketOptions {
/// Socket options that control the TCP keep-alive mechanism, see
/// [`KeepAlive`].
pub keep_alive: KeepAlive,
/// Switch to turn the Nagle algorithm on/off.
pub nagle_enabled: bool,
/// The period of time after which the connection should be aborted if no
/// ACK is received.
pub user_timeout: Option<NonZeroDuration>,
/// Switch to turn delayed ACK on/off.
pub delayed_ack: bool,
/// The period of time after which a dangling FIN_WAIT2 state should be
/// reclaimed.
pub fin_wait2_timeout: Option<Duration>,
/// The maximum SYN retransmissions before aborting a connection.
pub max_syn_retries: NonZeroU8,
}
impl Default for SocketOptions {
fn default() -> Self {
Self {
keep_alive: KeepAlive::default(),
// RFC 9293 Section 3.7.4:
// A TCP implementation SHOULD implement the Nagle algorithm to
// coalesce short segments
nagle_enabled: true,
user_timeout: None,
// RFC 9293 Section 4.2:
// The delayed ACK algorithm specified in [RFC1122] SHOULD be used
// by a TCP receiver.
// Delayed acks have *bad* performance for connections that are not
// interactive, especially when combined with the Nagle algorithm.
// We disable it by default here because:
// 1. RFC does not say MUST;
// 2. Common implementations like Linux has it turned off by
// default.
// More context: https://news.ycombinator.com/item?id=10607422
delayed_ack: false,
fin_wait2_timeout: Some(DEFAULT_FIN_WAIT2_TIMEOUT),
max_syn_retries: DEFAULT_MAX_SYN_RETRIES,
}
}
}
/// Options that are related to TCP keep-alive.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct KeepAlive {
/// The amount of time for an idle connection to wait before sending out
/// probes.
pub idle: NonZeroDuration,
/// Interval between consecutive probes.
pub interval: NonZeroDuration,
/// Maximum number of probes we send before considering the connection dead.
///
/// `u8` is enough because if a connection doesn't hear back from the peer
/// after 255 probes (the largest count a `NonZeroU8` can hold), then
/// chances are that the connection is already dead.
pub count: NonZeroU8,
/// Only send probes if keep-alive is enabled.
pub enabled: bool,
}
impl Default for KeepAlive {
fn default() -> Self {
Self {
// Default values inspired by Linux's TCP implementation:
// https://github.com/torvalds/linux/blob/0326074ff4652329f2a1a9c8685104576bd8d131/include/net/tcp.h#L155-L157
idle: const_unwrap::const_unwrap_option(NonZeroDuration::from_secs(2 * 60 * 60)),
interval: const_unwrap::const_unwrap_option(NonZeroDuration::from_secs(75)),
count: const_unwrap_option(NonZeroU8::new(9)),
// Per RFC 9293(https://datatracker.ietf.org/doc/html/rfc9293#section-3.8.4):
// ... they MUST default to off.
enabled: false,
}
}
}
/// TCP Counters.
///
/// Accrued for the entire stack, rather than on a per connection basis.
///
/// Note that for dual stack sockets, all events will be attributed to the IPv6
/// counters.
///
/// This is a [`TcpCountersInner`] wrapped in [`IpMarked`], so the counters for
/// each IP version `I` are a distinct type.
pub type TcpCounters<I> = IpMarked<I, TcpCountersInner>;
/// The IP agnostic version of [`TcpCounters`].
///
/// `Default` is derived, so every counter starts at its own default value.
#[derive(Default)]
// TODO(https://fxbug.dev/42052878): Add counters for SYN cookies.
// TODO(https://fxbug.dev/42078221): Add counters for SACK.
pub struct TcpCountersInner {
/// Count of received IP packets that were dropped because they had
/// unexpected IP addresses (either src or dst).
pub invalid_ip_addrs_received: Counter,
/// Count of received TCP segments that were dropped because they could not
/// be parsed.
pub invalid_segments_received: Counter,
/// Count of received TCP segments that were valid.
pub valid_segments_received: Counter,
/// Count of received TCP segments that were successfully dispatched to a
/// socket.
pub received_segments_dispatched: Counter,
/// Count of received TCP segments that were not associated with any
/// existing sockets.
pub received_segments_no_dispatch: Counter,
/// Count of received TCP segments that were dropped because the listener
/// queue was full.
pub listener_queue_overflow: Counter,
/// Count of TCP segments that failed to send.
pub segment_send_errors: Counter,
/// Count of TCP segments that were sent.
pub segments_sent: Counter,
/// Count of passive open attempts that failed because the stack doesn't
/// have a route to the peer.
pub passive_open_no_route_errors: Counter,
/// Count of passive connections that have been opened.
pub passive_connection_openings: Counter,
/// Count of active open attempts that have failed because the stack doesn't
/// have a route to the peer.
pub active_open_no_route_errors: Counter,
/// Count of active connections that have been opened.
pub active_connection_openings: Counter,
/// Count of all failed connection attempts, including both passive and
/// active opens.
pub failed_connection_attempts: Counter,
/// Count of port reservation attempts that failed.
pub failed_port_reservations: Counter,
/// Count of received segments whose checksums were invalid.
pub checksum_errors: Counter,
/// Count of received segments with the RST flag set.
pub resets_received: Counter,
/// Count of sent segments with the RST flag set.
pub resets_sent: Counter,
/// Count of received segments with the SYN flag set.
pub syns_received: Counter,
/// Count of sent segments with the SYN flag set.
pub syns_sent: Counter,
/// Count of received segments with the FIN flag set.
pub fins_received: Counter,
/// Count of sent segments with the FIN flag set.
pub fins_sent: Counter,
/// Count of retransmission timeouts.
pub timeouts: Counter,
/// Count of retransmissions of segments.
pub retransmits: Counter,
/// Count of retransmissions of segments while in slow start.
pub slow_start_retransmits: Counter,
/// Count of retransmissions of segments while in fast recovery.
pub fast_retransmits: Counter,
/// Count of times fast recovery was initiated to recover from packet loss.
pub fast_recovery: Counter,
/// Count of times an established TCP connection transitioned to CLOSED.
pub established_closed: Counter,
/// Count of times an established TCP connection transitioned to CLOSED due
/// to a RST segment.
pub established_resets: Counter,
/// Count of times an established TCP connection transitioned to CLOSED due
/// to a timeout (e.g. a keep-alive or retransmit timeout).
pub established_timedout: Counter,
}
/// Test-only constants, visible to the parent module.
#[cfg(test)]
mod testutil {
use super::Mss;
/// Per RFC 879 section 1 (https://tools.ietf.org/html/rfc879#section-1):
///
/// THE TCP MAXIMUM SEGMENT SIZE IS THE IP MAXIMUM DATAGRAM SIZE MINUS
/// FORTY.
/// The default IP Maximum Datagram Size is 576.
/// The default TCP Maximum Segment Size is 536.
pub(super) const DEFAULT_IPV4_MAXIMUM_SEGMENT_SIZE_USIZE: usize = 536;
/// [`DEFAULT_IPV4_MAXIMUM_SEGMENT_SIZE_USIZE`] expressed as an [`Mss`].
pub(super) const DEFAULT_IPV4_MAXIMUM_SEGMENT_SIZE: Mss =
Mss(const_unwrap::const_unwrap_option(core::num::NonZeroU16::new(
// The cast cannot truncate: 536 fits in a `u16`.
DEFAULT_IPV4_MAXIMUM_SEGMENT_SIZE_USIZE as u16,
)));
}