blob: e6672019f4c5cd54617caa4f925eca3c2ee221c2 [file] [log] [blame]
// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tcp
import (
"encoding/binary"
"fmt"
"math"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/google/netstack/rand"
"github.com/google/netstack/sleep"
"github.com/google/netstack/tcpip"
"github.com/google/netstack/tcpip/buffer"
"github.com/google/netstack/tcpip/hash/jenkins"
"github.com/google/netstack/tcpip/header"
"github.com/google/netstack/tcpip/iptables"
"github.com/google/netstack/tcpip/seqnum"
"github.com/google/netstack/tcpip/stack"
"github.com/google/netstack/tmutex"
"github.com/google/netstack/waiter"
)
// EndpointState represents the state of a TCP endpoint.
type EndpointState uint32
// Endpoint states. Note that are represented in a netstack-specific manner and
// may not be meaningful externally. Specifically, they need to be translated to
// Linux's representation for these states if presented to userspace.
const (
// Endpoint states internal to netstack. These map to the TCP state CLOSED.
StateInitial EndpointState = iota
StateBound
StateConnecting // Connect() called, but the initial SYN hasn't been sent.
StateError
// TCP protocol states.
StateEstablished
StateSynSent
StateSynRecv
StateFinWait1
StateFinWait2
StateTimeWait
StateClose
StateCloseWait
StateLastAck
StateListen
StateClosing
)
// connected is the set of states where an endpoint is connected to a peer.
func (s EndpointState) connected() bool {
switch s {
case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing:
return true
default:
return false
}
}
// String implements fmt.Stringer.String.
func (s EndpointState) String() string {
switch s {
case StateInitial:
return "INITIAL"
case StateBound:
return "BOUND"
case StateConnecting:
return "CONNECTING"
case StateError:
return "ERROR"
case StateEstablished:
return "ESTABLISHED"
case StateSynSent:
return "SYN-SENT"
case StateSynRecv:
return "SYN-RCVD"
case StateFinWait1:
return "FIN-WAIT1"
case StateFinWait2:
return "FIN-WAIT2"
case StateTimeWait:
return "TIME-WAIT"
case StateClose:
return "CLOSED"
case StateCloseWait:
return "CLOSE-WAIT"
case StateLastAck:
return "LAST-ACK"
case StateListen:
return "LISTEN"
case StateClosing:
return "CLOSING"
default:
panic("unreachable")
}
}
// Reasons for notifying the protocol goroutine.
const (
notifyNonZeroReceiveWindow = 1 << iota
notifyReceiveWindowChanged
notifyClose
notifyMTUChanged
notifyDrain
notifyReset
notifyKeepaliveChanged
notifyMSSChanged
)
// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
// Blocks is the maximum number of SACK blocks we track
// per endpoint.
Blocks [MaxSACKBlocks]header.SACKBlock
// NumBlocks is the number of valid SACK blocks stored in the
// blocks array above.
NumBlocks int
}
// rcvBufAutoTuneParams are used to hold state variables to compute
// the auto tuned recv buffer size.
//
// +stateify savable
type rcvBufAutoTuneParams struct {
// measureTime is the time at which the current measurement
// was started.
measureTime time.Time
// copied is the number of bytes copied out of the receive
// buffers since this measure began.
copied int
// prevCopied is the number of bytes copied out of the receive
// buffers in the previous RTT period.
prevCopied int
// rtt is the non-smoothed minimum RTT as measured by observing the time
// between when a byte is first acknowledged and the receipt of data
// that is at least one window beyond the sequence number that was
// acknowledged.
rtt time.Duration
// rttMeasureSeqNumber is the highest acceptable sequence number at the
// time this RTT measurement period began.
rttMeasureSeqNumber seqnum.Value
// rttMeasureTime is the absolute time at which the current rtt
// measurement period began.
rttMeasureTime time.Time
// disabled is true if an explicit receive buffer is set for the
// endpoint.
disabled bool
}
// ReceiveErrors collect segment receive errors within transport layer.
type ReceiveErrors struct {
tcpip.ReceiveErrors
// SegmentQueueDropped is the number of segments dropped due to
// a full segment queue.
SegmentQueueDropped tcpip.StatCounter
// ChecksumErrors is the number of segments dropped due to bad checksums.
ChecksumErrors tcpip.StatCounter
// ListenOverflowSynDrop is the number of times the listen queue overflowed
// and a SYN was dropped.
ListenOverflowSynDrop tcpip.StatCounter
// ListenOverflowAckDrop is the number of times the final ACK
// in the handshake was dropped due to overflow.
ListenOverflowAckDrop tcpip.StatCounter
// ZeroRcvWindowState is the number of times we advertised
// a zero receive window when rcvList is full.
ZeroRcvWindowState tcpip.StatCounter
}
// SendErrors collect segment send errors within the transport layer.
type SendErrors struct {
tcpip.SendErrors
// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
// to the network endpoint.
SegmentSendToNetworkFailed tcpip.StatCounter
// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
// to the network endpoint.
SynSendToNetworkFailed tcpip.StatCounter
// Retransmits is the number of TCP segments retransmitted.
Retransmits tcpip.StatCounter
// FastRetransmit is the number of segments retransmitted in fast
// recovery.
FastRetransmit tcpip.StatCounter
// Timeouts is the number of times the RTO expired.
Timeouts tcpip.StatCounter
}
// Stats holds statistics about the endpoint.
type Stats struct {
// SegmentsReceived is the number of TCP segments received that
// the transport layer successfully parsed.
SegmentsReceived tcpip.StatCounter
// SegmentsSent is the number of TCP segments sent.
SegmentsSent tcpip.StatCounter
// FailedConnectionAttempts is the number of times we saw Connect and
// Accept errors.
FailedConnectionAttempts tcpip.StatCounter
// ReceiveErrors collects segment receive errors within the
// transport layer.
ReceiveErrors ReceiveErrors
// ReadErrors collects segment read errors from an endpoint read call.
ReadErrors tcpip.ReadErrors
// SendErrors collects segment send errors within the transport layer.
SendErrors SendErrors
// WriteErrors collects segment write errors from an endpoint write call.
WriteErrors tcpip.WriteErrors
}
// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}
// EndpointInfo holds useful information about a transport endpoint which
// can be queried by monitoring tools.
//
// +stateify savable
type EndpointInfo struct {
stack.TransportEndpointInfo
// HardError is meaningful only when state is stateError. It stores the
// error to be returned when read/write syscalls are called and the
// endpoint is in this state. HardError is protected by endpoint mu.
HardError *tcpip.Error
}
// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
// marker interface.
func (*EndpointInfo) IsEndpointInfo() {}
// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// +stateify savable
type endpoint struct {
EndpointInfo
// workMu is used to arbitrate which goroutine may perform protocol
// work. Only the main protocol goroutine is expected to call Lock() on
// it, but other goroutines (e.g., send) may call TryLock() to eagerly
// perform work without having to wait for the main one to wake up.
workMu tmutex.Mutex
// The following fields are initialized at creation time and do not
// change throughout the lifetime of the endpoint.
stack *stack.Stack
waiterQueue *waiter.Queue
// lastError represents the last error that the endpoint reported;
// access to it is protected by the following mutex.
lastErrorMu sync.Mutex
lastError *tcpip.Error
// The following fields are used to manage the receive queue. The
// protocol goroutine adds ready-for-delivery segments to rcvList,
// which are returned by Read() calls to users.
//
// Once the peer has closed its send side, rcvClosed is set to true
// to indicate to users that no more data is coming.
//
// rcvListMu can be taken after the endpoint mu below.
rcvListMu sync.Mutex
rcvList segmentList
rcvClosed bool
rcvBufSize int
rcvBufUsed int
rcvAutoParams rcvBufAutoTuneParams
// zeroWindow indicates that the window was closed due to receive buffer
// space being filled up. This is set by the worker goroutine before
// moving a segment to the rcvList. This setting is cleared by the
// endpoint when a Read() call reads enough data for the new window to
// be non-zero.
zeroWindow bool
// The following fields are protected by the mutex.
mu sync.RWMutex
state EndpointState
isPortReserved bool
isRegistered bool
boundNICID tcpip.NICID
route stack.Route
ttl uint8
v6only bool
isConnectNotified bool
// TCP should never broadcast but Linux nevertheless supports enabling/
// disabling SO_BROADCAST, albeit as a NOOP.
broadcast bool
// effectiveNetProtos contains the network protocols actually in use. In
// most cases it will only contain "netProto", but in cases like IPv6
// endpoints with v6only set to false, this could include multiple
// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
// address).
effectiveNetProtos []tcpip.NetworkProtocolNumber
// workerRunning specifies if a worker goroutine is running.
workerRunning bool
// workerCleanup specifies if the worker goroutine must perform cleanup
// before exitting. This can only be set to true when workerRunning is
// also true, and they're both protected by the mutex.
workerCleanup bool
// sendTSOk is used to indicate when the TS Option has been negotiated.
// When sendTSOk is true every non-RST segment should carry a TS as per
// RFC7323#section-1.1
sendTSOk bool
// recentTS is the timestamp that should be sent in the TSEcr field of
// the timestamp for future segments sent by the endpoint. This field is
// updated if required when a new segment is received by this endpoint.
recentTS uint32
// tsOffset is a randomized offset added to the value of the
// TSVal field in the timestamp option.
tsOffset uint32
// shutdownFlags represent the current shutdown state of the endpoint.
shutdownFlags tcpip.ShutdownFlags
// sackPermitted is set to true if the peer sends the TCPSACKPermitted
// option in the SYN/SYN-ACK.
sackPermitted bool
// sack holds TCP SACK related information for this endpoint.
sack SACKInfo
// reusePort is set to true if SO_REUSEPORT is enabled.
reusePort bool
// bindToDevice is set to the NIC on which to bind or disabled if 0.
bindToDevice tcpip.NICID
// delay enables Nagle's algorithm.
//
// delay is a boolean (0 is false) and must be accessed atomically.
delay uint32
// cork holds back segments until full.
//
// cork is a boolean (0 is false) and must be accessed atomically.
cork uint32
// scoreboard holds TCP SACK Scoreboard information for this endpoint.
scoreboard *SACKScoreboard
// The options below aren't implemented, but we remember the user
// settings because applications expect to be able to set/query these
// options.
reuseAddr bool
// slowAck holds the negated state of quick ack. It is stubbed out and
// does nothing.
//
// slowAck is a boolean (0 is false) and must be accessed atomically.
slowAck uint32
// segmentQueue is used to hand received segments to the protocol
// goroutine. Segments are queued as long as the queue is not full,
// and dropped when it is.
segmentQueue segmentQueue
// synRcvdCount is the number of connections for this endpoint that are
// in SYN-RCVD state.
synRcvdCount int
// userMSS if non-zero is the MSS value explicitly set by the user
// for this endpoint using the TCP_MAXSEG setsockopt.
userMSS int
// The following fields are used to manage the send buffer. When
// segments are ready to be sent, they are added to sndQueue and the
// protocol goroutine is signaled via sndWaker.
//
// When the send side is closed, the protocol goroutine is notified via
// sndCloseWaker, and sndClosed is set to true.
sndBufMu sync.Mutex
sndBufSize int
sndBufUsed int
sndClosed bool
sndBufInQueue seqnum.Size
sndQueue segmentList
sndWaker sleep.Waker
sndCloseWaker sleep.Waker
// cc stores the name of the Congestion Control algorithm to use for
// this endpoint.
cc tcpip.CongestionControlOption
// The following are used when a "packet too big" control packet is
// received. They are protected by sndBufMu. They are used to
// communicate to the main protocol goroutine how many such control
// messages have been received since the last notification was processed
// and what was the smallest MTU seen.
packetTooBigCount int
sndMTU int
// newSegmentWaker is used to indicate to the protocol goroutine that
// it needs to wake up and handle new segments queued to it.
newSegmentWaker sleep.Waker
// notificationWaker is used to indicate to the protocol goroutine that
// it needs to wake up and check for notifications.
notificationWaker sleep.Waker
// notifyFlags is a bitmask of flags used to indicate to the protocol
// goroutine what it was notified; this is only accessed atomically.
notifyFlags uint32
// keepalive manages TCP keepalive state. When the connection is idle
// (no data sent or received) for keepaliveIdle, we start sending
// keepalives every keepalive.interval. If we send keepalive.count
// without hearing a response, the connection is closed.
keepalive keepalive
// pendingAccepted is a synchronization primitive used to track number
// of connections that are queued up to be delivered to the accepted
// channel. We use this to ensure that all goroutines blocked on writing
// to the acceptedChan below terminate before we close acceptedChan.
pendingAccepted sync.WaitGroup
// acceptedChan is used by a listening endpoint protocol goroutine to
// send newly accepted connections to the endpoint so that they can be
// read by Accept() calls.
acceptedChan chan *endpoint
// The following are only used from the protocol goroutine, and
// therefore don't need locks to protect them.
rcv *receiver
snd *sender
// The goroutine drain completion notification channel.
drainDone chan struct{}
// The goroutine undrain notification channel. This is currently used as
// a way to block the worker goroutines. Today nothing closes/writes
// this channel and this causes any goroutines waiting on this to just
// block. This is used during save/restore to prevent worker goroutines
// from mutating state as it's being saved.
undrain chan struct{}
// probe if not nil is invoked on every received segment. It is passed
// a copy of the current state of the endpoint.
probe stack.TCPProbeFunc
// The following are only used to assist the restore run to re-connect.
connectingAddress tcpip.Address
// amss is the advertised MSS to the peer by this endpoint.
amss uint16
// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
// applied while sending packets. Defaults to 0 as on Linux.
sendTOS uint8
gso *stack.GSO
// TODO(b/142022063): Add ability to save and restore per endpoint stats.
stats Stats
}
// StopWork halts packet processing. Only to be used in tests.
func (e *endpoint) StopWork() {
e.workMu.Lock()
}
// ResumeWork resumes packet processing. Only to be used in tests.
func (e *endpoint) ResumeWork() {
e.workMu.Unlock()
}
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// +stateify savable
type keepalive struct {
sync.Mutex
enabled bool
idle time.Duration
interval time.Duration
count int
unacked int
timer timer
waker sleep.Waker
}
func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
e := &endpoint{
stack: s,
EndpointInfo: EndpointInfo{
TransportEndpointInfo: stack.TransportEndpointInfo{
NetProto: netProto,
TransProto: header.TCPProtocolNumber,
},
},
waiterQueue: waiterQueue,
state: StateInitial,
rcvBufSize: DefaultReceiveBufferSize,
sndBufSize: DefaultSendBufferSize,
sndMTU: int(math.MaxInt32),
reuseAddr: true,
keepalive: keepalive{
// Linux defaults.
idle: 2 * time.Hour,
interval: 75 * time.Second,
count: 9,
},
}
var ss SendBufferSizeOption
if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
e.sndBufSize = ss.Default
}
var rs ReceiveBufferSizeOption
if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
e.rcvBufSize = rs.Default
}
var cs tcpip.CongestionControlOption
if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
e.cc = cs
}
var mrb tcpip.ModerateReceiveBufferOption
if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
e.rcvAutoParams.disabled = !bool(mrb)
}
if p := s.GetTCPProbe(); p != nil {
e.probe = p
}
e.segmentQueue.setLimit(MaxUnprocessedSegments)
e.workMu.Init()
e.workMu.Lock()
e.tsOffset = timeStampOffset()
return e
}
// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
result := waiter.EventMask(0)
e.mu.RLock()
defer e.mu.RUnlock()
switch e.state {
case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv:
// Ready for nothing.
case StateClose, StateError:
// Ready for anything.
result = mask
case StateListen:
// Check if there's anything in the accepted channel.
if (mask & waiter.EventIn) != 0 {
if len(e.acceptedChan) > 0 {
result |= waiter.EventIn
}
}
}
if e.state.connected() {
// Determine if the endpoint is writable if requested.
if (mask & waiter.EventOut) != 0 {
e.sndBufMu.Lock()
if e.sndClosed || e.sndBufUsed < e.sndBufSize {
result |= waiter.EventOut
}
e.sndBufMu.Unlock()
}
// Determine if the endpoint is readable if requested.
if (mask & waiter.EventIn) != 0 {
e.rcvListMu.Lock()
if e.rcvBufUsed > 0 || e.rcvClosed {
result |= waiter.EventIn
}
e.rcvListMu.Unlock()
}
}
return result
}
func (e *endpoint) fetchNotifications() uint32 {
return atomic.SwapUint32(&e.notifyFlags, 0)
}
func (e *endpoint) notifyProtocolGoroutine(n uint32) {
for {
v := atomic.LoadUint32(&e.notifyFlags)
if v&n == n {
// The flags are already set.
return
}
if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
if v == 0 {
// We are causing a transition from no flags to
// at least one flag set, so we must cause the
// protocol goroutine to wake up.
e.notificationWaker.Assert()
}
return
}
}
}
// Close puts the endpoint in a closed state and frees all resources associated
// with it. It must be called only once and with no other concurrent calls to
// the endpoint.
func (e *endpoint) Close() {
// Issue a shutdown so that the peer knows we won't send any more data
// if we're connected, or stop accepting if we're listening.
e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)
e.mu.Lock()
// For listening sockets, we always release ports inline so that they
// are immediately available for reuse after Close() is called. If also
// registered, we unregister as well otherwise the next user would fail
// in Listen() when trying to register.
if e.state == StateListen && e.isPortReserved {
if e.isRegistered {
e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
e.isRegistered = false
}
e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
e.isPortReserved = false
}
// Either perform the local cleanup or kick the worker to make sure it
// knows it needs to cleanup.
tcpip.AddDanglingEndpoint(e)
if !e.workerRunning {
e.cleanupLocked()
} else {
e.workerCleanup = true
e.notifyProtocolGoroutine(notifyClose)
}
e.mu.Unlock()
}
// closePendingAcceptableConnections closes all connections that have completed
// handshake but not yet been delivered to the application.
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
done := make(chan struct{})
// Spin a goroutine up as ranging on e.acceptedChan will just block when
// there are no more connections in the channel. Using a non-blocking
// select does not work as it can potentially select the default case
// even when there are pending writes but that are not yet written to
// the channel.
go func() {
defer close(done)
for n := range e.acceptedChan {
n.mu.Lock()
n.resetConnectionLocked(tcpip.ErrConnectionAborted)
n.mu.Unlock()
n.Close()
}
}()
// pendingAccepted(see endpoint.deliverAccepted) tracks the number of
// endpoints which have completed handshake but are not yet written to
// the e.acceptedChan. We wait here till the goroutine above can drain
// all such connections from e.acceptedChan.
e.pendingAccepted.Wait()
close(e.acceptedChan)
<-done
e.acceptedChan = nil
}
// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
func (e *endpoint) cleanupLocked() {
// Close all endpoints that might have been accepted by TCP but not by
// the client.
if e.acceptedChan != nil {
e.closePendingAcceptableConnectionsLocked()
}
e.workerCleanup = false
if e.isRegistered {
e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
e.isRegistered = false
}
if e.isPortReserved {
e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
e.isPortReserved = false
}
e.route.Release()
tcpip.DeleteDanglingEndpoint(e)
}
// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
func (e *endpoint) initialReceiveWindow() int {
rcvWnd := e.receiveBufferAvailable()
if rcvWnd > math.MaxUint16 {
rcvWnd = math.MaxUint16
}
routeWnd := InitialCwnd * int(mssForRoute(&e.route)) * 2
if rcvWnd > routeWnd {
rcvWnd = routeWnd
}
return rcvWnd
}
// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to user space.
func (e *endpoint) ModerateRecvBuf(copied int) {
e.rcvListMu.Lock()
if e.rcvAutoParams.disabled {
e.rcvListMu.Unlock()
return
}
now := time.Now()
if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
e.rcvAutoParams.copied += copied
e.rcvListMu.Unlock()
return
}
prevRTTCopied := e.rcvAutoParams.copied + copied
prevCopied := e.rcvAutoParams.prevCopied
rcvWnd := 0
if prevRTTCopied > prevCopied {
// The minimal receive window based on what was copied by the app
// in the immediate preceding RTT and some extra buffer for 16
// segments to account for variations.
// We multiply by 2 to account for packet losses.
rcvWnd = prevRTTCopied*2 + 16*int(e.amss)
// Scale for slow start based on bytes copied in this RTT vs previous.
grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied
// Multiply growth factor by 2 again to account for sender being
// in slow-start where the sender grows it's congestion window
// by 100% per RTT.
rcvWnd += grow * 2
// Make sure auto tuned buffer size can always receive upto 2x
// the initial window of 10 segments.
if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
rcvWnd = minRcvWnd
}
// Cap the auto tuned buffer size by the maximum permissible
// receive buffer size.
if max := e.maxReceiveBufferSize(); rcvWnd > max {
rcvWnd = max
}
// We do not adjust downwards as that can cause the receiver to
// reject valid data that might already be in flight as the
// acceptable window will shrink.
if rcvWnd > e.rcvBufSize {
e.rcvBufSize = rcvWnd
e.notifyProtocolGoroutine(notifyReceiveWindowChanged)
}
// We only update prevCopied when we grow the buffer because in cases
// where prevCopied > prevRTTCopied the existing buffer is already big
// enough to handle the current rate and we don't need to do any
// adjustments.
e.rcvAutoParams.prevCopied = prevRTTCopied
}
e.rcvAutoParams.measureTime = now
e.rcvAutoParams.copied = 0
e.rcvListMu.Unlock()
}
// IPTables implements tcpip.Endpoint.IPTables.
func (e *endpoint) IPTables() (iptables.IPTables, error) {
return e.stack.IPTables(), nil
}
// Read reads data from the endpoint.
func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
e.mu.RLock()
// The endpoint can be read if it's connected, or if it's already closed
// but has some pending unread data. Also note that a RST being received
// would cause the state to become StateError so we should allow the
// reads to proceed before returning a ECONNRESET.
e.rcvListMu.Lock()
bufUsed := e.rcvBufUsed
if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
e.rcvListMu.Unlock()
he := e.HardError
e.mu.RUnlock()
if s == StateError {
return buffer.View{}, tcpip.ControlMessages{}, he
}
e.stats.ReadErrors.InvalidEndpointState.Increment()
return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
}
v, err := e.readLocked()
e.rcvListMu.Unlock()
e.mu.RUnlock()
if err == tcpip.ErrClosedForReceive {
e.stats.ReadErrors.ReadClosed.Increment()
}
return v, tcpip.ControlMessages{}, err
}
func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) {
if e.rcvBufUsed == 0 {
if e.rcvClosed || !e.state.connected() {
return buffer.View{}, tcpip.ErrClosedForReceive
}
return buffer.View{}, tcpip.ErrWouldBlock
}
s := e.rcvList.Front()
views := s.data.Views()
v := views[s.viewToDeliver]
s.viewToDeliver++
if s.viewToDeliver >= len(views) {
e.rcvList.Remove(s)
s.decRef()
}
e.rcvBufUsed -= len(v)
// If the window was zero before this read and if the read freed up
// enough buffer space for the scaled window to be non-zero then notify
// the protocol goroutine to send a window update.
if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) {
e.zeroWindow = false
e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
}
return v, nil
}
// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
// Caller must hold e.mu and e.sndBufMu
func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) {
// The endpoint cannot be written to if it's not connected.
if !e.state.connected() {
switch e.state {
case StateError:
return 0, e.HardError
default:
return 0, tcpip.ErrClosedForSend
}
}
// Check if the connection has already been closed for sends.
if e.sndClosed {
return 0, tcpip.ErrClosedForSend
}
avail := e.sndBufSize - e.sndBufUsed
if avail <= 0 {
return 0, tcpip.ErrWouldBlock
}
return avail, nil
}
// Write writes data to the endpoint's peer.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
// Linux completely ignores any address passed to sendto(2) for TCP sockets
// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
// and opts.EndOfRecord are also ignored.
e.mu.RLock()
e.sndBufMu.Lock()
avail, err := e.isEndpointWritableLocked()
if err != nil {
e.sndBufMu.Unlock()
e.mu.RUnlock()
e.stats.WriteErrors.WriteClosed.Increment()
return 0, nil, err
}
// We can release locks while copying data.
//
// This is not possible if atomic is set, because we can't allow the
// available buffer space to be consumed by some other caller while we
// are copying data in.
if !opts.Atomic {
e.sndBufMu.Unlock()
e.mu.RUnlock()
}
// Fetch data.
v, perr := p.Payload(avail)
if perr != nil || len(v) == 0 {
if opts.Atomic { // See above.
e.sndBufMu.Unlock()
e.mu.RUnlock()
}
// Note that perr may be nil if len(v) == 0.
return 0, nil, perr
}
if !opts.Atomic { // See above.
e.mu.RLock()
e.sndBufMu.Lock()
// Because we released the lock before copying, check state again
// to make sure the endpoint is still in a valid state for a write.
avail, err = e.isEndpointWritableLocked()
if err != nil {
e.sndBufMu.Unlock()
e.mu.RUnlock()
e.stats.WriteErrors.WriteClosed.Increment()
return 0, nil, err
}
// Discard any excess data copied in due to avail being reduced due
// to a simultaneous write call to the socket.
if avail < len(v) {
v = v[:avail]
}
}
// Add data to the send queue.
s := newSegmentFromView(&e.route, e.ID, v)
e.sndBufUsed += len(v)
e.sndBufInQueue += seqnum.Size(len(v))
e.sndQueue.PushBack(s)
e.sndBufMu.Unlock()
// Release the endpoint lock to prevent deadlocks due to lock
// order inversion when acquiring workMu.
e.mu.RUnlock()
if e.workMu.TryLock() {
// Do the work inline.
e.handleWrite()
e.workMu.Unlock()
} else {
// Let the protocol goroutine do the work.
e.sndWaker.Assert()
}
return int64(len(v)), nil, nil
}
// Peek reads data without consuming it from the endpoint.
//
// This method does not block if there is no data pending.
func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
// The endpoint can be read if it's connected, or if it's already closed
// but has some pending unread data.
if s := e.state; !s.connected() && s != StateClose {
if s == StateError {
return 0, tcpip.ControlMessages{}, e.HardError
}
e.stats.ReadErrors.InvalidEndpointState.Increment()
return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
}
e.rcvListMu.Lock()
defer e.rcvListMu.Unlock()
if e.rcvBufUsed == 0 {
if e.rcvClosed || !e.state.connected() {
e.stats.ReadErrors.ReadClosed.Increment()
return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
}
return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
}
// Make a copy of vec so we can modify the slide headers.
vec = append([][]byte(nil), vec...)
var num int64
for s := e.rcvList.Front(); s != nil; s = s.Next() {
views := s.data.Views()
for i := s.viewToDeliver; i < len(views); i++ {
v := views[i]
for len(v) > 0 {
if len(vec) == 0 {
return num, tcpip.ControlMessages{}, nil
}
if len(vec[0]) == 0 {
vec = vec[1:]
continue
}
n := copy(vec[0], v)
v = v[n:]
vec[0] = vec[0][n:]
num += int64(n)
}
}
}
return num, tcpip.ControlMessages{}, nil
}
// zeroReceiveWindow checks if the receive window to be announced now would be
// zero, based on the amount of available buffer and the receive window scaling.
//
// It must be called with rcvListMu held.
func (e *endpoint) zeroReceiveWindow(scale uint8) bool {
if e.rcvBufUsed >= e.rcvBufSize {
return true
}
return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0
}
// SetSockOptInt sets a socket option.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
switch opt {
case tcpip.ReceiveBufferSizeOption:
// Make sure the receive buffer size is within the min and max
// allowed.
var rs ReceiveBufferSizeOption
size := int(v)
if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
if size < rs.Min {
size = rs.Min
}
if size > rs.Max {
size = rs.Max
}
}
mask := uint32(notifyReceiveWindowChanged)
e.rcvListMu.Lock()
// Make sure the receive buffer size allows us to send a
// non-zero window size.
scale := uint8(0)
if e.rcv != nil {
scale = e.rcv.rcvWndScale
}
if size>>scale == 0 {
size = 1 << scale
}
// Make sure 2*size doesn't overflow.
if size > math.MaxInt32/2 {
size = math.MaxInt32 / 2
}
e.rcvBufSize = size
e.rcvAutoParams.disabled = true
if e.zeroWindow && !e.zeroReceiveWindow(scale) {
e.zeroWindow = false
mask |= notifyNonZeroReceiveWindow
}
e.rcvListMu.Unlock()
e.notifyProtocolGoroutine(mask)
return nil
case tcpip.SendBufferSizeOption:
// Make sure the send buffer size is within the min and max
// allowed.
size := int(v)
var ss SendBufferSizeOption
if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
if size < ss.Min {
size = ss.Min
}
if size > ss.Max {
size = ss.Max
}
}
e.sndBufMu.Lock()
e.sndBufSize = size
e.sndBufMu.Unlock()
return nil
default:
return nil
}
}
// SetSockOpt sets a socket option.
func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
const inetECNMask = 3
switch v := opt.(type) {
case tcpip.DelayOption:
if v == 0 {
atomic.StoreUint32(&e.delay, 0)
// Handle delayed data.
e.sndWaker.Assert()
} else {
atomic.StoreUint32(&e.delay, 1)
}
return nil
case tcpip.CorkOption:
if v == 0 {
atomic.StoreUint32(&e.cork, 0)
// Handle the corked data.
e.sndWaker.Assert()
} else {
atomic.StoreUint32(&e.cork, 1)
}
return nil
case tcpip.ReuseAddressOption:
e.mu.Lock()
e.reuseAddr = v != 0
e.mu.Unlock()
return nil
case tcpip.ReusePortOption:
e.mu.Lock()
e.reusePort = v != 0
e.mu.Unlock()
return nil
case tcpip.BindToDeviceOption:
e.mu.Lock()
defer e.mu.Unlock()
if v == "" {
e.bindToDevice = 0
return nil
}
for nicid, nic := range e.stack.NICInfo() {
if nic.Name == string(v) {
e.bindToDevice = nicid
return nil
}
}
return tcpip.ErrUnknownDevice
case tcpip.QuickAckOption:
if v == 0 {
atomic.StoreUint32(&e.slowAck, 1)
} else {
atomic.StoreUint32(&e.slowAck, 0)
}
return nil
case tcpip.MaxSegOption:
userMSS := v
if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
return tcpip.ErrInvalidOptionValue
}
e.mu.Lock()
e.userMSS = int(userMSS)
e.mu.Unlock()
e.notifyProtocolGoroutine(notifyMSSChanged)
return nil
case tcpip.V6OnlyOption:
// We only recognize this option on v6 endpoints.
if e.NetProto != header.IPv6ProtocolNumber {
return tcpip.ErrInvalidEndpointState
}
e.mu.Lock()
defer e.mu.Unlock()
// We only allow this to be set when we're in the initial state.
if e.state != StateInitial {
return tcpip.ErrInvalidEndpointState
}
e.v6only = v != 0
return nil
case tcpip.TTLOption:
e.mu.Lock()
e.ttl = uint8(v)
e.mu.Unlock()
return nil
case tcpip.KeepaliveEnabledOption:
e.keepalive.Lock()
e.keepalive.enabled = v != 0
e.keepalive.Unlock()
e.notifyProtocolGoroutine(notifyKeepaliveChanged)
return nil
case tcpip.KeepaliveIdleOption:
e.keepalive.Lock()
e.keepalive.idle = time.Duration(v)
e.keepalive.Unlock()
e.notifyProtocolGoroutine(notifyKeepaliveChanged)
return nil
case tcpip.KeepaliveIntervalOption:
e.keepalive.Lock()
e.keepalive.interval = time.Duration(v)
e.keepalive.Unlock()
e.notifyProtocolGoroutine(notifyKeepaliveChanged)
return nil
case tcpip.KeepaliveCountOption:
e.keepalive.Lock()
e.keepalive.count = int(v)
e.keepalive.Unlock()
e.notifyProtocolGoroutine(notifyKeepaliveChanged)
return nil
case tcpip.BroadcastOption:
e.mu.Lock()
e.broadcast = v != 0
e.mu.Unlock()
return nil
case tcpip.CongestionControlOption:
// Query the available cc algorithms in the stack and
// validate that the specified algorithm is actually
// supported in the stack.
var avail tcpip.AvailableCongestionControlOption
if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
return err
}
availCC := strings.Split(string(avail), " ")
for _, cc := range availCC {
if v == tcpip.CongestionControlOption(cc) {
// Acquire the work mutex as we may need to
// reinitialize the congestion control state.
e.mu.Lock()
state := e.state
e.cc = v
e.mu.Unlock()
switch state {
case StateEstablished:
e.workMu.Lock()
e.mu.Lock()
if e.state == state {
e.snd.cc = e.snd.initCongestionControl(e.cc)
}
e.mu.Unlock()
e.workMu.Unlock()
}
return nil
}
}
// Linux returns ENOENT when an invalid congestion
// control algorithm is specified.
return tcpip.ErrNoSuchFile
case tcpip.IPv4TOSOption:
e.mu.Lock()
// TODO(gvisor.dev/issue/995): ECN is not currently supported,
// ignore the bits for now.
e.sendTOS = uint8(v) & ^uint8(inetECNMask)
e.mu.Unlock()
return nil
case tcpip.IPv6TrafficClassOption:
e.mu.Lock()
// TODO(gvisor.dev/issue/995): ECN is not currently supported,
// ignore the bits for now.
e.sendTOS = uint8(v) & ^uint8(inetECNMask)
e.mu.Unlock()
return nil
default:
return nil
}
}
// readyReceiveSize returns the number of bytes ready to be received.
func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
// The endpoint cannot be in listen state.
if e.state == StateListen {
return 0, tcpip.ErrInvalidEndpointState
}
e.rcvListMu.Lock()
defer e.rcvListMu.Unlock()
return e.rcvBufUsed, nil
}
// GetSockOptInt implements tcpip.Endpoint.GetSockOptInt.
func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) {
switch opt {
case tcpip.ReceiveQueueSizeOption:
return e.readyReceiveSize()
case tcpip.SendBufferSizeOption:
e.sndBufMu.Lock()
v := e.sndBufSize
e.sndBufMu.Unlock()
return v, nil
case tcpip.ReceiveBufferSizeOption:
e.rcvListMu.Lock()
v := e.rcvBufSize
e.rcvListMu.Unlock()
return v, nil
}
return -1, tcpip.ErrUnknownProtocolOption
}
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error {
switch o := opt.(type) {
case tcpip.ErrorOption:
e.lastErrorMu.Lock()
err := e.lastError
e.lastError = nil
e.lastErrorMu.Unlock()
return err
case *tcpip.MaxSegOption:
// This is just stubbed out. Linux never returns the user_mss
// value as it either returns the defaultMSS or returns the
// actual current MSS. Netstack just returns the defaultMSS
// always for now.
*o = header.TCPDefaultMSS
return nil
case *tcpip.DelayOption:
*o = 0
if v := atomic.LoadUint32(&e.delay); v != 0 {
*o = 1
}
return nil
case *tcpip.CorkOption:
*o = 0
if v := atomic.LoadUint32(&e.cork); v != 0 {
*o = 1
}
return nil
case *tcpip.ReuseAddressOption:
e.mu.RLock()
v := e.reuseAddr
e.mu.RUnlock()
*o = 0
if v {
*o = 1
}
return nil
case *tcpip.ReusePortOption:
e.mu.RLock()
v := e.reusePort
e.mu.RUnlock()
*o = 0
if v {
*o = 1
}
return nil
case *tcpip.BindToDeviceOption:
e.mu.RLock()
defer e.mu.RUnlock()
if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok {
*o = tcpip.BindToDeviceOption(nic.Name)
return nil
}
*o = ""
return nil
case *tcpip.QuickAckOption:
*o = 1
if v := atomic.LoadUint32(&e.slowAck); v != 0 {
*o = 0
}
return nil
case *tcpip.V6OnlyOption:
// We only recognize this option on v6 endpoints.
if e.NetProto != header.IPv6ProtocolNumber {
return tcpip.ErrUnknownProtocolOption
}
e.mu.Lock()
v := e.v6only
e.mu.Unlock()
*o = 0
if v {
*o = 1
}
return nil
case *tcpip.TTLOption:
e.mu.Lock()
*o = tcpip.TTLOption(e.ttl)
e.mu.Unlock()
return nil
case *tcpip.TCPInfoOption:
*o = tcpip.TCPInfoOption{}
e.mu.RLock()
snd := e.snd
e.mu.RUnlock()
if snd != nil {
snd.rtt.Lock()
o.RTT = snd.rtt.srtt
o.RTTVar = snd.rtt.rttvar
snd.rtt.Unlock()
}
return nil
case *tcpip.KeepaliveEnabledOption:
e.keepalive.Lock()
v := e.keepalive.enabled
e.keepalive.Unlock()
*o = 0
if v {
*o = 1
}
return nil
case *tcpip.KeepaliveIdleOption:
e.keepalive.Lock()
*o = tcpip.KeepaliveIdleOption(e.keepalive.idle)
e.keepalive.Unlock()
return nil
case *tcpip.KeepaliveIntervalOption:
e.keepalive.Lock()
*o = tcpip.KeepaliveIntervalOption(e.keepalive.interval)
e.keepalive.Unlock()
return nil
case *tcpip.KeepaliveCountOption:
e.keepalive.Lock()
*o = tcpip.KeepaliveCountOption(e.keepalive.count)
e.keepalive.Unlock()
return nil
case *tcpip.OutOfBandInlineOption:
// We don't currently support disabling this option.
*o = 1
return nil
case *tcpip.BroadcastOption:
e.mu.Lock()
v := e.broadcast
e.mu.Unlock()
*o = 0
if v {
*o = 1
}
return nil
case *tcpip.CongestionControlOption:
e.mu.Lock()
*o = e.cc
e.mu.Unlock()
return nil
case *tcpip.IPv4TOSOption:
e.mu.RLock()
*o = tcpip.IPv4TOSOption(e.sendTOS)
e.mu.RUnlock()
return nil
case *tcpip.IPv6TrafficClassOption:
e.mu.RLock()
*o = tcpip.IPv6TrafficClassOption(e.sendTOS)
e.mu.RUnlock()
return nil
default:
return tcpip.ErrUnknownProtocolOption
}
}
func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) {
netProto := e.NetProto
if header.IsV4MappedAddress(addr.Addr) {
// Fail if using a v4 mapped address on a v6only endpoint.
if e.v6only {
return 0, tcpip.ErrNoRoute
}
netProto = header.IPv4ProtocolNumber
addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:]
if addr.Addr == header.IPv4Any {
addr.Addr = ""
}
}
// Fail if we're bound to an address length different from the one we're
// checking.
if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) {
return 0, tcpip.ErrInvalidEndpointState
}
return netProto, nil
}
// Disconnect implements tcpip.Endpoint.Disconnect.
func (*endpoint) Disconnect() *tcpip.Error {
return tcpip.ErrNotSupported
}
// Connect connects the endpoint to its peer.
func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error {
err := e.connect(addr, true, true)
if err != nil && !err.IgnoreStats() {
e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
e.stats.FailedConnectionAttempts.Increment()
}
return err
}
// connect connects the endpoint to its peer. In the normal non-S/R case, the
// new connection is expected to run the main goroutine and perform handshake.
// In restore of previously connected endpoints, both ends will be passively
// created (so no new handshaking is done); for stack-accepted connections not
// yet accepted by the app, they are restored without running the main goroutine
// here.
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error {
e.mu.Lock()
defer e.mu.Unlock()
connectingAddr := addr.Addr
netProto, err := e.checkV4Mapped(&addr)
if err != nil {
return err
}
if e.state.connected() {
// The endpoint is already connected. If caller hasn't been
// notified yet, return success.
if !e.isConnectNotified {
e.isConnectNotified = true
return nil
}
// Otherwise return that it's already connected.
return tcpip.ErrAlreadyConnected
}
nicid := addr.NIC
switch e.state {
case StateBound:
// If we're already bound to a NIC but the caller is requesting
// that we use a different one now, we cannot proceed.
if e.boundNICID == 0 {
break
}
if nicid != 0 && nicid != e.boundNICID {
return tcpip.ErrNoRoute
}
nicid = e.boundNICID
case StateInitial:
// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
// when we find a route.
case StateConnecting, StateSynSent, StateSynRecv:
// A connection request has already been issued but hasn't completed
// yet.
return tcpip.ErrAlreadyConnecting
case StateError:
return e.HardError
default:
return tcpip.ErrInvalidEndpointState
}
// Find a route to the desired destination.
r, err := e.stack.FindRoute(nicid, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
if err != nil {
return err
}
defer r.Release()
origID := e.ID
netProtos := []tcpip.NetworkProtocolNumber{netProto}
e.ID.LocalAddress = r.LocalAddress
e.ID.RemoteAddress = r.RemoteAddress
e.ID.RemotePort = addr.Port
if e.ID.LocalPort != 0 {
// The endpoint is bound to a port, attempt to register it.
err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice)
if err != nil {
return err
}
} else {
// The endpoint doesn't have a local port yet, so try to get
// one. Make sure that it isn't one that will result in the same
// address/port for both local and remote (otherwise this
// endpoint would be trying to connect to itself).
sameAddr := e.ID.LocalAddress == e.ID.RemoteAddress
// Calculate a port offset based on the destination IP/port and
// src IP to ensure that for a given tuple (srcIP, destIP,
// destPort) the offset used as a starting point is the same to
// ensure that we can cycle through the port space effectively.
h := jenkins.Sum32(e.stack.PortSeed())
h.Write([]byte(e.ID.LocalAddress))
h.Write([]byte(e.ID.RemoteAddress))
portBuf := make([]byte, 2)
binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)
h.Write(portBuf)
portOffset := h.Sum32()
if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) {
if sameAddr && p == e.ID.RemotePort {
return false, nil
}
// reusePort is false below because connect cannot reuse a port even if
// reusePort was set.
if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) {
return false, nil
}
id := e.ID
id.LocalPort = p
switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) {
case nil:
e.ID = id
return true, nil
case tcpip.ErrPortInUse:
return false, nil
default:
return false, err
}
}); err != nil {
return err
}
}
// Remove the port reservation. This can happen when Bind is called
// before Connect: in such a case we don't want to hold on to
// reservations anymore.
if e.isPortReserved {
e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice)
e.isPortReserved = false
}
e.isRegistered = true
e.state = StateConnecting
e.route = r.Clone()
e.boundNICID = nicid
e.effectiveNetProtos = netProtos
e.connectingAddress = connectingAddr
e.initGSO()
// Connect in the restore phase does not perform handshake. Restore its
// connection setting here.
if !handshake {
e.segmentQueue.mu.Lock()
for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} {
for s := l.Front(); s != nil; s = s.Next() {
s.id = e.ID
s.route = r.Clone()
e.sndWaker.Assert()
}
}
e.segmentQueue.mu.Unlock()
e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
e.state = StateEstablished
e.stack.Stats().TCP.CurrentEstablished.Increment()
}
if run {
e.workerRunning = true
e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
go e.protocolMainLoop(handshake)
}
return tcpip.ErrConnectStarted
}
// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
return tcpip.ErrInvalidEndpointState
}
// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
e.mu.Lock()
defer e.mu.Unlock()
e.shutdownFlags |= flags
switch {
case e.state.connected():
// Close for read.
if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
// Mark read side as closed.
e.rcvListMu.Lock()
e.rcvClosed = true
rcvBufUsed := e.rcvBufUsed
e.rcvListMu.Unlock()
// If we're fully closed and we have unread data we need to abort
// the connection with a RST.
if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
e.notifyProtocolGoroutine(notifyReset)
return nil
}
}
// Close for write.
if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
e.sndBufMu.Lock()
if e.sndClosed {
// Already closed.
e.sndBufMu.Unlock()
break
}
// Queue fin segment.
s := newSegmentFromView(&e.route, e.ID, nil)
e.sndQueue.PushBack(s)
e.sndBufInQueue++
// Mark endpoint as closed.
e.sndClosed = true
e.sndBufMu.Unlock()
// Tell protocol goroutine to close.
e.sndCloseWaker.Assert()
}
case e.state == StateListen:
// Tell protocolListenLoop to stop.
if flags&tcpip.ShutdownRead != 0 {
e.notifyProtocolGoroutine(notifyClose)
}
default:
return tcpip.ErrNotConnected
}
return nil
}
// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
func (e *endpoint) Listen(backlog int) *tcpip.Error {
err := e.listen(backlog)
if err != nil && !err.IgnoreStats() {
e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
e.stats.FailedConnectionAttempts.Increment()
}
return err
}
func (e *endpoint) listen(backlog int) *tcpip.Error {
e.mu.Lock()
defer e.mu.Unlock()
// Allow the backlog to be adjusted if the endpoint is not shutting down.
// When the endpoint shuts down, it sets workerCleanup to true, and from
// that point onward, acceptedChan is the responsibility of the cleanup()
// method (and should not be touched anywhere else, including here).
if e.state == StateListen && !e.workerCleanup {
// Adjust the size of the channel iff we can fix existing
// pending connections into the new one.
if len(e.acceptedChan) > backlog {
return tcpip.ErrInvalidEndpointState
}
if cap(e.acceptedChan) == backlog {
return nil
}
origChan := e.acceptedChan
e.acceptedChan = make(chan *endpoint, backlog)
close(origChan)
for ep := range origChan {
e.acceptedChan <- ep
}
return nil
}
// Endpoint must be bound before it can transition to listen mode.
if e.state != StateBound {
e.stats.ReadErrors.InvalidEndpointState.Increment()
return tcpip.ErrInvalidEndpointState
}
// Register the endpoint.
if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice); err != nil {
return err
}
e.isRegistered = true
e.state = StateListen
if e.acceptedChan == nil {
e.acceptedChan = make(chan *endpoint, backlog)
}
e.workerRunning = true
go e.protocolListenLoop(
seqnum.Size(e.receiveBufferAvailable()))
return nil
}
// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
e.waiterQueue = waiterQueue
e.workerRunning = true
go e.protocolMainLoop(false)
}
// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
// Endpoint must be in listen state before it can accept connections.
if e.state != StateListen {
return nil, nil, tcpip.ErrInvalidEndpointState
}
// Get the new accepted endpoint.
var n *endpoint
select {
case n = <-e.acceptedChan:
default:
return nil, nil, tcpip.ErrWouldBlock
}
// Start the protocol goroutine.
wq := &waiter.Queue{}
n.startAcceptedLoop(wq)
e.stack.Stats().TCP.PassiveConnectionOpenings.Increment()
return n, wq, nil
}
// Bind binds the endpoint to a specific local port and optionally address.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
e.mu.Lock()
defer e.mu.Unlock()
// Don't allow binding once endpoint is not in the initial state
// anymore. This is because once the endpoint goes into a connected or
// listen state, it is already bound.
if e.state != StateInitial {
return tcpip.ErrAlreadyBound
}
e.BindAddr = addr.Addr
netProto, err := e.checkV4Mapped(&addr)
if err != nil {
return err
}
// Expand netProtos to include v4 and v6 if the caller is binding to a
// wildcard (empty) address, and this is an IPv6 endpoint with v6only
// set to false.
netProtos := []tcpip.NetworkProtocolNumber{netProto}
if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
netProtos = []tcpip.NetworkProtocolNumber{
header.IPv6ProtocolNumber,
header.IPv4ProtocolNumber,
}
}
port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
if err != nil {
return err
}
e.isPortReserved = true
e.effectiveNetProtos = netProtos
e.ID.LocalPort = port
// Any failures beyond this point must remove the port registration.
defer func(bindToDevice tcpip.NICID) {
if err != nil {
e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
e.isPortReserved = false
e.effectiveNetProtos = nil
e.ID.LocalPort = 0
e.ID.LocalAddress = ""
e.boundNICID = 0
}
}(e.bindToDevice)
// If an address is specified, we must ensure that it's one of our
// local addresses.
if len(addr.Addr) != 0 {
nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
if nic == 0 {
return tcpip.ErrBadLocalAddress
}
e.boundNICID = nic
e.ID.LocalAddress = addr.Addr
}
// Mark endpoint as bound.
e.state = StateBound
return nil
}
// GetLocalAddress returns the address to which the endpoint is bound.
func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
return tcpip.FullAddress{
Addr: e.ID.LocalAddress,
Port: e.ID.LocalPort,
NIC: e.boundNICID,
}, nil
}
// GetRemoteAddress returns the address to which the endpoint is connected.
func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) {
e.mu.RLock()
defer e.mu.RUnlock()
if !e.state.connected() {
return tcpip.FullAddress{}, tcpip.ErrNotConnected
}
return tcpip.FullAddress{
Addr: e.ID.RemoteAddress,
Port: e.ID.RemotePort,
NIC: e.boundNICID,
}, nil
}
// HandlePacket is called by the stack when new packets arrive to this transport
// endpoint.
func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
s := newSegment(r, id, vv)
if !s.parse() {
e.stack.Stats().MalformedRcvdPackets.Increment()
e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
s.decRef()
return
}
if !s.csumValid {
e.stack.Stats().MalformedRcvdPackets.Increment()
e.stack.Stats().TCP.ChecksumErrors.Increment()
e.stats.ReceiveErrors.ChecksumErrors.Increment()
s.decRef()
return
}
e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
e.stats.SegmentsReceived.Increment()
if (s.flags & header.TCPFlagRst) != 0 {
e.stack.Stats().TCP.ResetsReceived.Increment()
}
// Send packet to worker goroutine.
if e.segmentQueue.enqueue(s) {
e.newSegmentWaker.Assert()
} else {
// The queue is full, so we drop the segment.
e.stack.Stats().DroppedPackets.Increment()
e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
s.decRef()
}
}
// HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket.
func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) {
switch typ {
case stack.ControlPacketTooBig:
e.sndBufMu.Lock()
e.packetTooBigCount++
if v := int(extra); v < e.sndMTU {
e.sndMTU = v
}
e.sndBufMu.Unlock()
e.notifyProtocolGoroutine(notifyMTUChanged)
}
}
// updateSndBufferUsage is called by the protocol goroutine when room opens up
// in the send buffer. The number of newly available bytes is v.
func (e *endpoint) updateSndBufferUsage(v int) {
e.sndBufMu.Lock()
notify := e.sndBufUsed >= e.sndBufSize>>1
e.sndBufUsed -= v
// We only notify when there is half the sndBufSize available after
// a full buffer event occurs. This ensures that we don't wake up
// writers to queue just 1-2 segments and go back to sleep.
notify = notify && e.sndBufUsed < e.sndBufSize>>1
e.sndBufMu.Unlock()
if notify {
e.waiterQueue.Notify(waiter.EventOut)
}
}
// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
func (e *endpoint) readyToRead(s *segment) {
e.rcvListMu.Lock()
if s != nil {
s.incRef()
e.rcvBufUsed += s.data.Size()
// Check if the receive window is now closed. If so make sure
// we set the zero window before we deliver the segment to ensure
// that a subsequent read of the segment will correctly trigger
// a non-zero notification.
if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
e.zeroWindow = true
}
e.rcvList.PushBack(s)
} else {
e.rcvClosed = true
}
e.rcvListMu.Unlock()
e.waiterQueue.Notify(waiter.EventIn)
}
// receiveBufferAvailableLocked calculates how many bytes are still available
// in the receive buffer.
// rcvListMu must be held when this function is called.
func (e *endpoint) receiveBufferAvailableLocked() int {
// We may use more bytes than the buffer size when the receive buffer
// shrinks.
if e.rcvBufUsed >= e.rcvBufSize {
return 0
}
return e.rcvBufSize - e.rcvBufUsed
}
// receiveBufferAvailable calculates how many bytes are still available in the
// receive buffer.
func (e *endpoint) receiveBufferAvailable() int {
e.rcvListMu.Lock()
available := e.receiveBufferAvailableLocked()
e.rcvListMu.Unlock()
return available
}
func (e *endpoint) receiveBufferSize() int {
e.rcvListMu.Lock()
size := e.rcvBufSize
e.rcvListMu.Unlock()
return size
}
func (e *endpoint) maxReceiveBufferSize() int {
var rs ReceiveBufferSizeOption
if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil {
// As a fallback return the hardcoded max buffer size.
return MaxBufferSize
}
return rs.Max
}
// rcvWndScaleForHandshake computes the receive window scale to offer to the
// peer when window scaling is enabled (true by default). If auto-tuning is
// disabled then the window scaling factor is based on the size of the
// receiveBuffer otherwise we use the max permissible receive buffer size to
// compute the scale.
func (e *endpoint) rcvWndScaleForHandshake() int {
bufSizeForScale := e.receiveBufferSize()
e.rcvListMu.Lock()
autoTuningDisabled := e.rcvAutoParams.disabled
e.rcvListMu.Unlock()
if autoTuningDisabled {
return FindWndScale(seqnum.Size(bufSizeForScale))
}
return FindWndScale(seqnum.Size(e.maxReceiveBufferSize()))
}
// updateRecentTimestamp updates the recent timestamp using the algorithm
// described in https://tools.ietf.org/html/rfc7323#section-4.3
func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) {
if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) {
e.recentTS = tsVal
}
}
// maybeEnableTimestamp marks the timestamp option enabled for this endpoint if
// the SYN options indicate that timestamp option was negotiated. It also
// initializes the recentTS with the value provided in synOpts.TSval.
func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) {
if synOpts.TS {
e.sendTSOk = true
e.recentTS = synOpts.TSVal
}
}
// timestamp returns the timestamp value to be used in the TSVal field of the
// timestamp option for outgoing TCP segments for a given endpoint.
func (e *endpoint) timestamp() uint32 {
return tcpTimeStamp(e.tsOffset)
}
// tcpTimeStamp returns a timestamp offset by the provided offset. This is
// not inlined above as it's used when SYN cookies are in use and endpoint
// is not created at the time when the SYN cookie is sent.
func tcpTimeStamp(offset uint32) uint32 {
now := time.Now()
return uint32(now.Unix()*1000+int64(now.Nanosecond()/1e6)) + offset
}
// timeStampOffset returns a randomized timestamp offset to be used when sending
// timestamp values in a timestamp option for a TCP segment.
func timeStampOffset() uint32 {
b := make([]byte, 4)
if _, err := rand.Read(b); err != nil {
panic(err)
}
// Initialize a random tsOffset that will be added to the recentTS
// everytime the timestamp is sent when the Timestamp option is enabled.
//
// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
// why this is required.
//
// NOTE: This is not completely to spec as normally this should be
// initialized in a manner analogous to how sequence numbers are
// randomized per connection basis. But for now this is sufficient.
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
// maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint
// if the SYN options indicate that the SACK option was negotiated and the TCP
// stack is configured to enable TCP SACK option.
func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) {
var v SACKEnabled
if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil {
// Stack doesn't support SACK. So just return.
return
}
if bool(v) && synOpts.SACKPermitted {
e.sackPermitted = true
}
}
// maxOptionSize return the maximum size of TCP options.
func (e *endpoint) maxOptionSize() (size int) {
var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock
options := e.makeOptions(maxSackBlocks[:])
size = len(options)
putOptions(options)
return size
}
// completeState makes a full copy of the endpoint and returns it. This is used
// before invoking the probe. The state returned may not be fully consistent if
// there are intervening syscalls when the state is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
var s stack.TCPEndpointState
s.SegTime = time.Now()
// Copy EndpointID.
e.mu.Lock()
s.ID = stack.TCPEndpointID(e.ID)
e.mu.Unlock()
// Copy endpoint rcv state.
e.rcvListMu.Lock()
s.RcvBufSize = e.rcvBufSize
s.RcvBufUsed = e.rcvBufUsed
s.RcvClosed = e.rcvClosed
s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
e.rcvListMu.Unlock()
// Endpoint TCP Option state.
s.SendTSOk = e.sendTSOk
s.RecentTS = e.recentTS
s.TSOffset = e.tsOffset
s.SACKPermitted = e.sackPermitted
s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()
// Copy endpoint send state.
e.sndBufMu.Lock()
s.SndBufSize = e.sndBufSize
s.SndBufUsed = e.sndBufUsed
s.SndClosed = e.sndClosed
s.SndBufInQueue = e.sndBufInQueue
s.PacketTooBigCount = e.packetTooBigCount
s.SndMTU = e.sndMTU
e.sndBufMu.Unlock()
// Copy receiver state.
s.Receiver = stack.TCPReceiverState{
RcvNxt: e.rcv.rcvNxt,
RcvAcc: e.rcv.rcvAcc,
RcvWndScale: e.rcv.rcvWndScale,
PendingBufUsed: e.rcv.pendingBufUsed,
PendingBufSize: e.rcv.pendingBufSize,
}
// Copy sender state.
s.Sender = stack.TCPSenderState{
LastSendTime: e.snd.lastSendTime,
DupAckCount: e.snd.dupAckCount,
FastRecovery: stack.TCPFastRecoveryState{
Active: e.snd.fr.active,
First: e.snd.fr.first,
Last: e.snd.fr.last,
MaxCwnd: e.snd.fr.maxCwnd,
HighRxt: e.snd.fr.highRxt,
RescueRxt: e.snd.fr.rescueRxt,
},
SndCwnd: e.snd.sndCwnd,
Ssthresh: e.snd.sndSsthresh,
SndCAAckCount: e.snd.sndCAAckCount,
Outstanding: e.snd.outstanding,
SndWnd: e.snd.sndWnd,
SndUna: e.snd.sndUna,
SndNxt: e.snd.sndNxt,
RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
RTTMeasureTime: e.snd.rttMeasureTime,
Closed: e.snd.closed,
RTO: e.snd.rto,
MaxPayloadSize: e.snd.maxPayloadSize,
SndWndScale: e.snd.sndWndScale,
MaxSentAck: e.snd.maxSentAck,
}
e.snd.rtt.Lock()
s.Sender.SRTT = e.snd.rtt.srtt
s.Sender.SRTTInited = e.snd.rtt.srttInited
e.snd.rtt.Unlock()
if cubic, ok := e.snd.cc.(*cubicState); ok {
s.Sender.Cubic = stack.TCPCubicState{
WMax: cubic.wMax,
WLastMax: cubic.wLastMax,
T: cubic.t,
TimeSinceLastCongestion: time.Since(cubic.t),
C: cubic.c,
K: cubic.k,
Beta: cubic.beta,
WC: cubic.wC,
WEst: cubic.wEst,
}
}
return s
}
func (e *endpoint) initHardwareGSO() {
gso := &stack.GSO{}
switch e.route.NetProto {
case header.IPv4ProtocolNumber:
gso.Type = stack.GSOTCPv4
gso.L3HdrLen = header.IPv4MinimumSize
case header.IPv6ProtocolNumber:
gso.Type = stack.GSOTCPv6
gso.L3HdrLen = header.IPv6MinimumSize
default:
panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto))
}
gso.NeedsCsum = true
gso.CsumOffset = header.TCPChecksumOffset
gso.MaxSize = e.route.GSOMaxSize()
e.gso = gso
}
func (e *endpoint) initGSO() {
if e.route.Capabilities()&stack.CapabilityHardwareGSO != 0 {
e.initHardwareGSO()
} else if e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0 {
e.gso = &stack.GSO{
MaxSize: e.route.GSOMaxSize(),
Type: stack.GSOSW,
NeedsCsum: false,
}
}
}
// State implements tcpip.Endpoint.State. It exports the endpoint's protocol
// state for diagnostics.
func (e *endpoint) State() uint32 {
e.mu.Lock()
defer e.mu.Unlock()
return uint32(e.state)
}
// Info returns a copy of the endpoint info.
func (e *endpoint) Info() tcpip.EndpointInfo {
e.mu.RLock()
// Make a copy of the endpoint info.
ret := e.EndpointInfo
e.mu.RUnlock()
return &ret
}
// Stats returns a pointer to the endpoint stats.
func (e *endpoint) Stats() tcpip.EndpointStats {
return &e.stats
}
func mssForRoute(r *stack.Route) uint16 {
return uint16(r.MTU() - header.TCPMinimumSize)
}