| // Copyright 2018 The gVisor Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package tcp |
| |
| import ( |
| "container/list" |
| "encoding/binary" |
| "fmt" |
| "io" |
| "math" |
| "runtime" |
| "strings" |
| "sync/atomic" |
| "time" |
| |
| "gvisor.dev/gvisor/pkg/rand" |
| "gvisor.dev/gvisor/pkg/sleep" |
| "gvisor.dev/gvisor/pkg/sync" |
| "gvisor.dev/gvisor/pkg/tcpip" |
| "gvisor.dev/gvisor/pkg/tcpip/hash/jenkins" |
| "gvisor.dev/gvisor/pkg/tcpip/header" |
| "gvisor.dev/gvisor/pkg/tcpip/ports" |
| "gvisor.dev/gvisor/pkg/tcpip/seqnum" |
| "gvisor.dev/gvisor/pkg/tcpip/stack" |
| "gvisor.dev/gvisor/pkg/waiter" |
| ) |
| |
| // EndpointState represents the state of a TCP endpoint. |
| type EndpointState uint32 |
| |
| // Endpoint states. Note that are represented in a netstack-specific manner and |
| // may not be meaningful externally. Specifically, they need to be translated to |
| // Linux's representation for these states if presented to userspace. |
| const ( |
| // Endpoint states internal to netstack. These map to the TCP state CLOSED. |
| StateInitial EndpointState = iota |
| StateBound |
| StateConnecting // Connect() called, but the initial SYN hasn't been sent. |
| StateError |
| |
| // TCP protocol states. |
| StateEstablished |
| StateSynSent |
| StateSynRecv |
| StateFinWait1 |
| StateFinWait2 |
| StateTimeWait |
| StateClose |
| StateCloseWait |
| StateLastAck |
| StateListen |
| StateClosing |
| ) |
| |
| const ( |
| // rcvAdvWndScale is used to split the available socket buffer into |
| // application buffer and the window to be advertised to the peer. This is |
| // currently hard coded to split the available space equally. |
| rcvAdvWndScale = 1 |
| |
| // SegOverheadFactor is used to multiply the value provided by the |
| // user on a SetSockOpt for setting the socket send/receive buffer sizes. |
| SegOverheadFactor = 2 |
| ) |
| |
| // connected returns true when s is one of the states representing an |
| // endpoint connected to a peer. |
| func (s EndpointState) connected() bool { |
| switch s { |
| case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: |
| return true |
| default: |
| return false |
| } |
| } |
| |
| // connecting returns true when s is one of the states representing a |
| // connection in progress, but not yet fully established. |
| func (s EndpointState) connecting() bool { |
| switch s { |
| case StateConnecting, StateSynSent, StateSynRecv: |
| return true |
| default: |
| return false |
| } |
| } |
| |
| // handshake returns true when s is one of the states representing an endpoint |
| // in the middle of a TCP handshake. |
| func (s EndpointState) handshake() bool { |
| switch s { |
| case StateSynSent, StateSynRecv: |
| return true |
| default: |
| return false |
| } |
| } |
| |
| // closed returns true when s is one of the states an endpoint transitions to |
| // when closed or when it encounters an error. This is distinct from a newly |
| // initialized endpoint that was never connected. |
| func (s EndpointState) closed() bool { |
| switch s { |
| case StateClose, StateError: |
| return true |
| default: |
| return false |
| } |
| } |
| |
| // String implements fmt.Stringer.String. |
| func (s EndpointState) String() string { |
| switch s { |
| case StateInitial: |
| return "INITIAL" |
| case StateBound: |
| return "BOUND" |
| case StateConnecting: |
| return "CONNECTING" |
| case StateError: |
| return "ERROR" |
| case StateEstablished: |
| return "ESTABLISHED" |
| case StateSynSent: |
| return "SYN-SENT" |
| case StateSynRecv: |
| return "SYN-RCVD" |
| case StateFinWait1: |
| return "FIN-WAIT1" |
| case StateFinWait2: |
| return "FIN-WAIT2" |
| case StateTimeWait: |
| return "TIME-WAIT" |
| case StateClose: |
| return "CLOSED" |
| case StateCloseWait: |
| return "CLOSE-WAIT" |
| case StateLastAck: |
| return "LAST-ACK" |
| case StateListen: |
| return "LISTEN" |
| case StateClosing: |
| return "CLOSING" |
| default: |
| panic("unreachable") |
| } |
| } |
| |
| // Reasons for notifying the protocol goroutine. |
| const ( |
| notifyNonZeroReceiveWindow = 1 << iota |
| notifyClose |
| notifyMTUChanged |
| notifyDrain |
| notifyReset |
| notifyResetByPeer |
| // notifyAbort is a request for an expedited teardown. |
| notifyAbort |
| notifyKeepaliveChanged |
| notifyMSSChanged |
| // notifyTickleWorker is used to tickle the protocol main loop during a |
| // restore after we update the endpoint state to the correct one. This |
| // ensures the loop terminates if the final state of the endpoint is |
| // say TIME_WAIT. |
| notifyTickleWorker |
| notifyError |
| ) |
| |
| // SACKInfo holds TCP SACK related information for a given endpoint. |
| // |
| // +stateify savable |
| type SACKInfo struct { |
| // Blocks is the maximum number of SACK blocks we track |
| // per endpoint. |
| Blocks [MaxSACKBlocks]header.SACKBlock |
| |
| // NumBlocks is the number of valid SACK blocks stored in the |
| // blocks array above. |
| NumBlocks int |
| } |
| |
| // rcvBufAutoTuneParams are used to hold state variables to compute |
| // the auto tuned recv buffer size. |
| // |
| // +stateify savable |
| type rcvBufAutoTuneParams struct { |
| // measureTime is the time at which the current measurement |
| // was started. |
| measureTime time.Time `state:".(unixTime)"` |
| |
| // copied is the number of bytes copied out of the receive |
| // buffers since this measure began. |
| copied int |
| |
| // prevCopied is the number of bytes copied out of the receive |
| // buffers in the previous RTT period. |
| prevCopied int |
| |
| // rtt is the non-smoothed minimum RTT as measured by observing the time |
| // between when a byte is first acknowledged and the receipt of data |
| // that is at least one window beyond the sequence number that was |
| // acknowledged. |
| rtt time.Duration |
| |
| // rttMeasureSeqNumber is the highest acceptable sequence number at the |
| // time this RTT measurement period began. |
| rttMeasureSeqNumber seqnum.Value |
| |
| // rttMeasureTime is the absolute time at which the current rtt |
| // measurement period began. |
| rttMeasureTime time.Time `state:".(unixTime)"` |
| |
| // disabled is true if an explicit receive buffer is set for the |
| // endpoint. |
| disabled bool |
| } |
| |
| // ReceiveErrors collect segment receive errors within transport layer. |
| type ReceiveErrors struct { |
| tcpip.ReceiveErrors |
| |
| // SegmentQueueDropped is the number of segments dropped due to |
| // a full segment queue. |
| SegmentQueueDropped tcpip.StatCounter |
| |
| // ChecksumErrors is the number of segments dropped due to bad checksums. |
| ChecksumErrors tcpip.StatCounter |
| |
| // ListenOverflowSynDrop is the number of times the listen queue overflowed |
| // and a SYN was dropped. |
| ListenOverflowSynDrop tcpip.StatCounter |
| |
| // ListenOverflowAckDrop is the number of times the final ACK |
| // in the handshake was dropped due to overflow. |
| ListenOverflowAckDrop tcpip.StatCounter |
| |
| // ZeroRcvWindowState is the number of times we advertised |
| // a zero receive window when rcvList is full. |
| ZeroRcvWindowState tcpip.StatCounter |
| |
| // WantZeroWindow is the number of times we wanted to advertise a |
| // zero receive window but couldn't because it would have caused |
| // the receive window's right edge to shrink. |
| WantZeroRcvWindow tcpip.StatCounter |
| } |
| |
| // SendErrors collect segment send errors within the transport layer. |
| type SendErrors struct { |
| tcpip.SendErrors |
| |
| // SegmentSendToNetworkFailed is the number of TCP segments failed to be sent |
| // to the network endpoint. |
| SegmentSendToNetworkFailed tcpip.StatCounter |
| |
| // SynSendToNetworkFailed is the number of TCP SYNs failed to be sent |
| // to the network endpoint. |
| SynSendToNetworkFailed tcpip.StatCounter |
| |
| // Retransmits is the number of TCP segments retransmitted. |
| Retransmits tcpip.StatCounter |
| |
| // FastRetransmit is the number of segments retransmitted in fast |
| // recovery. |
| FastRetransmit tcpip.StatCounter |
| |
| // Timeouts is the number of times the RTO expired. |
| Timeouts tcpip.StatCounter |
| } |
| |
| // Stats holds statistics about the endpoint. |
| type Stats struct { |
| // SegmentsReceived is the number of TCP segments received that |
| // the transport layer successfully parsed. |
| SegmentsReceived tcpip.StatCounter |
| |
| // SegmentsSent is the number of TCP segments sent. |
| SegmentsSent tcpip.StatCounter |
| |
| // FailedConnectionAttempts is the number of times we saw Connect and |
| // Accept errors. |
| FailedConnectionAttempts tcpip.StatCounter |
| |
| // ReceiveErrors collects segment receive errors within the |
| // transport layer. |
| ReceiveErrors ReceiveErrors |
| |
| // ReadErrors collects segment read errors from an endpoint read call. |
| ReadErrors tcpip.ReadErrors |
| |
| // SendErrors collects segment send errors within the transport layer. |
| SendErrors SendErrors |
| |
| // WriteErrors collects segment write errors from an endpoint write call. |
| WriteErrors tcpip.WriteErrors |
| } |
| |
| // IsEndpointStats is an empty method to implement the tcpip.EndpointStats |
| // marker interface. |
| func (*Stats) IsEndpointStats() {} |
| |
| // EndpointInfo holds useful information about a transport endpoint which |
| // can be queried by monitoring tools. This exists to allow tcp-only state to |
| // be exposed. |
| // |
| // +stateify savable |
| type EndpointInfo struct { |
| stack.TransportEndpointInfo |
| } |
| |
| // IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo |
| // marker interface. |
| func (*EndpointInfo) IsEndpointInfo() {} |
| |
| // +stateify savable |
| type accepted struct { |
| // NB: this could be an endpointList, but ilist only permits endpoints to |
| // belong to one list at a time, and endpoints are already stored in the |
| // dispatcher's list. |
| endpoints list.List `state:".([]*endpoint)"` |
| cap int |
| } |
| |
| // endpoint represents a TCP endpoint. This struct serves as the interface |
| // between users of the endpoint and the protocol implementation; it is legal to |
| // have concurrent goroutines make calls into the endpoint, they are properly |
| // synchronized. The protocol implementation, however, runs in a single |
| // goroutine. |
| // |
| // Each endpoint has a few mutexes: |
| // |
| // e.mu -> Primary mutex for an endpoint must be held for all operations except |
| // in e.Readiness where acquiring it will result in a deadlock in epoll |
| // implementation. |
| // |
| // The following three mutexes can be acquired independent of e.mu but if |
| // acquired with e.mu then e.mu must be acquired first. |
| // |
| // e.acceptMu -> protects accepted. |
| // e.rcvListMu -> Protects the rcvList and associated fields. |
| // e.sndBufMu -> Protects the sndQueue and associated fields. |
| // e.lastErrorMu -> Protects the lastError field. |
| // |
| // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different |
| // based on the context in which the lock is acquired. In the syscall context |
| // e.LockUser/e.UnlockUser should be used and when doing background processing |
| // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below |
| // in brief. |
| // |
| // The reason for this locking behaviour is to avoid wakeups to handle packets. |
| // In cases where the endpoint is already locked the background processor can |
| // queue the packet up and go its merry way and the lock owner will eventually |
| // process the backlog when releasing the lock. Similarly when acquiring the |
| // lock from say a syscall goroutine we can implement a bit of spinning if we |
| // know that the lock is not held by another syscall goroutine. Background |
| // processors should never hold the lock for long and we can avoid an expensive |
| // sleep/wakeup by spinning for a shortwhile. |
| // |
| // For more details please see the detailed documentation on |
| // e.LockUser/e.UnlockUser methods. |
| // |
| // +stateify savable |
| type endpoint struct { |
| EndpointInfo |
| tcpip.DefaultSocketOptionsHandler |
| |
| // endpointEntry is used to queue endpoints for processing to the |
| // a given tcp processor goroutine. |
| // |
| // Precondition: epQueue.mu must be held to read/write this field.. |
| endpointEntry `state:"nosave"` |
| |
| // pendingProcessing is true if this endpoint is queued for processing |
| // to a TCP processor. |
| // |
| // Precondition: epQueue.mu must be held to read/write this field.. |
| pendingProcessing bool `state:"nosave"` |
| |
| // The following fields are initialized at creation time and do not |
| // change throughout the lifetime of the endpoint. |
| stack *stack.Stack `state:"manual"` |
| waiterQueue *waiter.Queue `state:"wait"` |
| uniqueID uint64 |
| |
| // hardError is meaningful only when state is stateError. It stores the |
| // error to be returned when read/write syscalls are called and the |
| // endpoint is in this state. hardError is protected by endpoint mu. |
| hardError tcpip.Error |
| |
| // lastError represents the last error that the endpoint reported; |
| // access to it is protected by the following mutex. |
| lastErrorMu sync.Mutex `state:"nosave"` |
| lastError tcpip.Error |
| |
| // rcvReadMu synchronizes calls to Read. |
| // |
| // mu and rcvListMu are temporarily released during data copying. rcvReadMu |
| // must be held during each read to ensure atomicity, so that multiple reads |
| // do not interleave. |
| // |
| // rcvReadMu should be held before holding mu. |
| rcvReadMu sync.Mutex `state:"nosave"` |
| |
| // rcvListMu synchronizes access to rcvList. |
| // |
| // rcvListMu can be taken after the endpoint mu below. |
| rcvListMu sync.Mutex `state:"nosave"` |
| |
| // rcvList is the queue for ready-for-delivery segments. |
| // |
| // rcvReadMu, mu and rcvListMu must be held, in the stated order, to read data |
| // and removing segments from list. A range of segment can be determined, then |
| // temporarily release mu and rcvListMu while processing the segment range. |
| // This allows new segments to be appended to the list while processing. |
| // |
| // rcvListMu must be held to append segments to list. |
| rcvList segmentList `state:"wait"` |
| rcvClosed bool |
| // rcvBufSize is the total size of the receive buffer. |
| rcvBufSize int |
| // rcvBufUsed is the actual number of payload bytes held in the receive buffer |
| // not counting any overheads of the segments itself. NOTE: This will always |
| // be strictly <= rcvMemUsed below. |
| rcvBufUsed int |
| rcvAutoParams rcvBufAutoTuneParams |
| |
| // rcvMemUsed tracks the total amount of memory in use by received segments |
| // held in rcvList, pendingRcvdSegments and the segment queue. This is used to |
| // compute the window and the actual available buffer space. This is distinct |
| // from rcvBufUsed above which is the actual number of payload bytes held in |
| // the buffer not including any segment overheads. |
| // |
| // rcvMemUsed must be accessed atomically. |
| rcvMemUsed int32 |
| |
| // mu protects all endpoint fields unless documented otherwise. mu must |
| // be acquired before interacting with the endpoint fields. |
| // |
| // During handshake, mu is locked by the protocol listen goroutine and |
| // released by the handshake completion goroutine. |
| mu sync.CrossGoroutineMutex `state:"nosave"` |
| ownedByUser uint32 |
| |
| // state must be read/set using the EndpointState()/setEndpointState() |
| // methods. |
| state EndpointState `state:".(EndpointState)"` |
| |
| // origEndpointState is only used during a restore phase to save the |
| // endpoint state at restore time as the socket is moved to it's correct |
| // state. |
| origEndpointState EndpointState `state:"nosave"` |
| |
| isPortReserved bool `state:"manual"` |
| isRegistered bool `state:"manual"` |
| boundNICID tcpip.NICID |
| route *stack.Route `state:"manual"` |
| ttl uint8 |
| isConnectNotified bool |
| |
| // h stores a reference to the current handshake state if the endpoint is in |
| // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep. |
| // nil otherwise. |
| h *handshake `state:"nosave"` |
| |
| // portFlags stores the current values of port related flags. |
| portFlags ports.Flags |
| |
| // Values used to reserve a port or register a transport endpoint |
| // (which ever happens first). |
| boundBindToDevice tcpip.NICID |
| boundPortFlags ports.Flags |
| boundDest tcpip.FullAddress |
| |
| // effectiveNetProtos contains the network protocols actually in use. In |
| // most cases it will only contain "netProto", but in cases like IPv6 |
| // endpoints with v6only set to false, this could include multiple |
| // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., |
| // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped |
| // address). |
| effectiveNetProtos []tcpip.NetworkProtocolNumber |
| |
| // workerRunning specifies if a worker goroutine is running. |
| workerRunning bool |
| |
| // workerCleanup specifies if the worker goroutine must perform cleanup |
| // before exiting. This can only be set to true when workerRunning is |
| // also true, and they're both protected by the mutex. |
| workerCleanup bool |
| |
| // sendTSOk is used to indicate when the TS Option has been negotiated. |
| // When sendTSOk is true every non-RST segment should carry a TS as per |
| // RFC7323#section-1.1 |
| sendTSOk bool |
| |
| // recentTS is the timestamp that should be sent in the TSEcr field of |
| // the timestamp for future segments sent by the endpoint. This field is |
| // updated if required when a new segment is received by this endpoint. |
| recentTS uint32 |
| |
| // recentTSTime is the unix time when we updated recentTS last. |
| recentTSTime time.Time `state:".(unixTime)"` |
| |
| // tsOffset is a randomized offset added to the value of the |
| // TSVal field in the timestamp option. |
| tsOffset uint32 |
| |
| // shutdownFlags represent the current shutdown state of the endpoint. |
| shutdownFlags tcpip.ShutdownFlags |
| |
| // tcpRecovery is the loss deteoction algorithm used by TCP. |
| tcpRecovery tcpip.TCPRecovery |
| |
| // sackPermitted is set to true if the peer sends the TCPSACKPermitted |
| // option in the SYN/SYN-ACK. |
| sackPermitted bool |
| |
| // sack holds TCP SACK related information for this endpoint. |
| sack SACKInfo |
| |
| // delay enables Nagle's algorithm. |
| // |
| // delay is a boolean (0 is false) and must be accessed atomically. |
| delay uint32 |
| |
| // scoreboard holds TCP SACK Scoreboard information for this endpoint. |
| scoreboard *SACKScoreboard |
| |
| // segmentQueue is used to hand received segments to the protocol |
| // goroutine. Segments are queued as long as the queue is not full, |
| // and dropped when it is. |
| segmentQueue segmentQueue `state:"wait"` |
| |
| // synRcvdCount is the number of connections for this endpoint that are |
| // in SYN-RCVD state; this is only accessed atomically. |
| synRcvdCount int32 |
| |
| // userMSS if non-zero is the MSS value explicitly set by the user |
| // for this endpoint using the TCP_MAXSEG setsockopt. |
| userMSS uint16 |
| |
| // maxSynRetries is the maximum number of SYN retransmits that TCP should |
| // send before aborting the attempt to connect. It cannot exceed 255. |
| // |
| // NOTE: This is currently a no-op and does not change the SYN |
| // retransmissions. |
| maxSynRetries uint8 |
| |
| // windowClamp is used to bound the size of the advertised window to |
| // this value. |
| windowClamp uint32 |
| |
| // The following fields are used to manage the send buffer. When |
| // segments are ready to be sent, they are added to sndQueue and the |
| // protocol goroutine is signaled via sndWaker. |
| // |
| // When the send side is closed, the protocol goroutine is notified via |
| // sndCloseWaker, and sndClosed is set to true. |
| sndBufMu sync.Mutex `state:"nosave"` |
| sndBufUsed int |
| sndClosed bool |
| sndBufInQueue seqnum.Size |
| sndQueue segmentList `state:"wait"` |
| sndWaker sleep.Waker `state:"manual"` |
| sndCloseWaker sleep.Waker `state:"manual"` |
| |
| // cc stores the name of the Congestion Control algorithm to use for |
| // this endpoint. |
| cc tcpip.CongestionControlOption |
| |
| // The following are used when a "packet too big" control packet is |
| // received. They are protected by sndBufMu. They are used to |
| // communicate to the main protocol goroutine how many such control |
| // messages have been received since the last notification was processed |
| // and what was the smallest MTU seen. |
| packetTooBigCount int |
| sndMTU int |
| |
| // newSegmentWaker is used to indicate to the protocol goroutine that |
| // it needs to wake up and handle new segments queued to it. |
| newSegmentWaker sleep.Waker `state:"manual"` |
| |
| // notificationWaker is used to indicate to the protocol goroutine that |
| // it needs to wake up and check for notifications. |
| notificationWaker sleep.Waker `state:"manual"` |
| |
| // notifyFlags is a bitmask of flags used to indicate to the protocol |
| // goroutine what it was notified; this is only accessed atomically. |
| notifyFlags uint32 `state:"nosave"` |
| |
| // keepalive manages TCP keepalive state. When the connection is idle |
| // (no data sent or received) for keepaliveIdle, we start sending |
| // keepalives every keepalive.interval. If we send keepalive.count |
| // without hearing a response, the connection is closed. |
| keepalive keepalive |
| |
| // userTimeout if non-zero specifies a user specified timeout for |
| // a connection w/ pending data to send. A connection that has pending |
| // unacked data will be forcibily aborted if the timeout is reached |
| // without any data being acked. |
| userTimeout time.Duration |
| |
| // deferAccept if non-zero specifies a user specified time during |
| // which the final ACK of a handshake will be dropped provided the |
| // ACK is a bare ACK and carries no data. If the timeout is crossed then |
| // the bare ACK is accepted and the connection is delivered to the |
| // listener. |
| deferAccept time.Duration |
| |
| // pendingAccepted tracks connections queued to be accepted. It is used to |
| // ensure such queued connections are terminated before the accepted queue is |
| // marked closed (by setting its capacity to zero). |
| pendingAccepted sync.WaitGroup `state:"nosave"` |
| |
| // acceptMu protects accepted. |
| acceptMu sync.Mutex `state:"nosave"` |
| |
| // acceptCond is a condition variable that can be used to block on when |
| // accepted is full and an endpoint is ready to be delivered. |
| // |
| // We use this condition variable to block/unblock goroutines which |
| // tried to deliver an endpoint but couldn't because accept backlog was |
| // full ( See: endpoint.deliverAccepted ). |
| acceptCond *sync.Cond `state:"nosave"` |
| |
| // accepted is used by a listening endpoint protocol goroutine to |
| // send newly accepted connections to the endpoint so that they can be |
| // read by Accept() calls. |
| accepted accepted |
| |
| // The following are only used from the protocol goroutine, and |
| // therefore don't need locks to protect them. |
| rcv *receiver `state:"wait"` |
| snd *sender `state:"wait"` |
| |
| // The goroutine drain completion notification channel. |
| drainDone chan struct{} `state:"nosave"` |
| |
| // The goroutine undrain notification channel. This is currently used as |
| // a way to block the worker goroutines. Today nothing closes/writes |
| // this channel and this causes any goroutines waiting on this to just |
| // block. This is used during save/restore to prevent worker goroutines |
| // from mutating state as it's being saved. |
| undrain chan struct{} `state:"nosave"` |
| |
| // probe if not nil is invoked on every received segment. It is passed |
| // a copy of the current state of the endpoint. |
| probe stack.TCPProbeFunc `state:"nosave"` |
| |
| // The following are only used to assist the restore run to re-connect. |
| connectingAddress tcpip.Address |
| |
| // amss is the advertised MSS to the peer by this endpoint. |
| amss uint16 |
| |
| // sendTOS represents IPv4 TOS or IPv6 TrafficClass, |
| // applied while sending packets. Defaults to 0 as on Linux. |
| sendTOS uint8 |
| |
| gso *stack.GSO |
| |
| // TODO(b/142022063): Add ability to save and restore per endpoint stats. |
| stats Stats `state:"nosave"` |
| |
| // tcpLingerTimeout is the maximum amount of a time a socket |
| // a socket stays in TIME_WAIT state before being marked |
| // closed. |
| tcpLingerTimeout time.Duration |
| |
| // closed indicates that the user has called closed on the |
| // endpoint and at this point the endpoint is only around |
| // to complete the TCP shutdown. |
| closed bool |
| |
| // txHash is the transport layer hash to be set on outbound packets |
| // emitted by this endpoint. |
| txHash uint32 |
| |
| // owner is used to get uid and gid of the packet. |
| owner tcpip.PacketOwner |
| |
| // ops is used to get socket level options. |
| ops tcpip.SocketOptions |
| |
| // lastOutOfWindowAckTime is the time at which the an ACK was sent in response |
| // to an out of window segment being received by this endpoint. |
| lastOutOfWindowAckTime time.Time `state:".(unixTime)"` |
| } |
| |
| // UniqueID implements stack.TransportEndpoint.UniqueID. |
| func (e *endpoint) UniqueID() uint64 { |
| return e.uniqueID |
| } |
| |
| // calculateAdvertisedMSS calculates the MSS to advertise. |
| // |
| // If userMSS is non-zero and is not greater than the maximum possible MSS for |
| // r, it will be used; otherwise, the maximum possible MSS will be used. |
| func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { |
| // The maximum possible MSS is dependent on the route. |
| // TODO(b/143359391): Respect TCP Min and Max size. |
| maxMSS := uint16(r.MTU() - header.TCPMinimumSize) |
| |
| if userMSS != 0 && userMSS < maxMSS { |
| return userMSS |
| } |
| |
| return maxMSS |
| } |
| |
| // LockUser tries to lock e.mu and if it fails it will check if the lock is held |
| // by another syscall goroutine. If yes, then it will goto sleep waiting for the |
| // lock to be released, if not then it will spin till it acquires the lock or |
| // another syscall goroutine acquires it in which case it will goto sleep as |
| // described above. |
| // |
| // The assumption behind spinning here being that background packet processing |
| // should not be holding the lock for long and spinning reduces latency as we |
| // avoid an expensive sleep/wakeup of of the syscall goroutine). |
| func (e *endpoint) LockUser() { |
| for { |
| // Try first if the sock is locked then check if it's owned |
| // by another user goroutine if not then we spin, otherwise |
| // we just go to sleep on the Lock() and wait. |
| if !e.mu.TryLock() { |
| // If socket is owned by the user then just go to sleep |
| // as the lock could be held for a reasonably long time. |
| if atomic.LoadUint32(&e.ownedByUser) == 1 { |
| e.mu.Lock() |
| atomic.StoreUint32(&e.ownedByUser, 1) |
| return |
| } |
| // Spin but yield the processor since the lower half |
| // should yield the lock soon. |
| runtime.Gosched() |
| continue |
| } |
| atomic.StoreUint32(&e.ownedByUser, 1) |
| return |
| } |
| } |
| |
| // UnlockUser will check if there are any segments already queued for processing |
| // and process any such segments before unlocking e.mu. This is required because |
| // we when packets arrive and endpoint lock is already held then such packets |
| // are queued up to be processed. If the lock is held by the endpoint goroutine |
| // then it will process these packets but if the lock is instead held by the |
| // syscall goroutine then we can have the syscall goroutine process the backlog |
| // before unlocking. |
| // |
| // This avoids an unnecessary wakeup of the endpoint protocol goroutine for the |
| // endpoint. It's also required eventually when we get rid of the endpoint |
| // protocol goroutine altogether. |
| // |
| // Precondition: e.LockUser() must have been called before calling e.UnlockUser() |
| // +checklocks:e.mu |
| func (e *endpoint) UnlockUser() { |
| // Lock segment queue before checking so that we avoid a race where |
| // segments can be queued between the time we check if queue is empty |
| // and actually unlock the endpoint mutex. |
| for { |
| e.segmentQueue.mu.Lock() |
| if e.segmentQueue.emptyLocked() { |
| if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { |
| panic("e.UnlockUser() called without calling e.LockUser()") |
| } |
| e.mu.Unlock() |
| e.segmentQueue.mu.Unlock() |
| return |
| } |
| e.segmentQueue.mu.Unlock() |
| |
| switch e.EndpointState() { |
| case StateEstablished: |
| if err := e.handleSegments(true /* fastPath */); err != nil { |
| e.notifyProtocolGoroutine(notifyTickleWorker) |
| } |
| default: |
| // Since we are waking the endpoint goroutine here just unlock |
| // and let it process the queued segments. |
| e.newSegmentWaker.Assert() |
| if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { |
| panic("e.UnlockUser() called without calling e.LockUser()") |
| } |
| e.mu.Unlock() |
| return |
| } |
| } |
| } |
| |
| // StopWork halts packet processing. Only to be used in tests. |
| func (e *endpoint) StopWork() { |
| e.mu.Lock() |
| } |
| |
| // ResumeWork resumes packet processing. Only to be used in tests. |
| // +checklocks:e.mu |
| func (e *endpoint) ResumeWork() { |
| e.mu.Unlock() |
| } |
| |
| // setEndpointState updates the state of the endpoint to state atomically. This |
| // method is unexported as the only place we should update the state is in this |
| // package but we allow the state to be read freely without holding e.mu. |
| // |
| // Precondition: e.mu must be held to call this method. |
| func (e *endpoint) setEndpointState(state EndpointState) { |
| oldstate := EndpointState(atomic.LoadUint32((*uint32)(&e.state))) |
| switch state { |
| case StateEstablished: |
| e.stack.Stats().TCP.CurrentEstablished.Increment() |
| e.stack.Stats().TCP.CurrentConnected.Increment() |
| case StateError: |
| fallthrough |
| case StateClose: |
| if oldstate == StateCloseWait || oldstate == StateEstablished { |
| e.stack.Stats().TCP.EstablishedResets.Increment() |
| } |
| fallthrough |
| default: |
| if oldstate == StateEstablished { |
| e.stack.Stats().TCP.CurrentEstablished.Decrement() |
| } |
| } |
| atomic.StoreUint32((*uint32)(&e.state), uint32(state)) |
| } |
| |
| // EndpointState returns the current state of the endpoint. |
| func (e *endpoint) EndpointState() EndpointState { |
| return EndpointState(atomic.LoadUint32((*uint32)(&e.state))) |
| } |
| |
| // setRecentTimestamp sets the recentTS field to the provided value. |
| func (e *endpoint) setRecentTimestamp(recentTS uint32) { |
| e.recentTS = recentTS |
| e.recentTSTime = time.Now() |
| } |
| |
| // recentTimestamp returns the value of the recentTS field. |
| func (e *endpoint) recentTimestamp() uint32 { |
| return e.recentTS |
| } |
| |
| // keepalive is a synchronization wrapper used to appease stateify. See the |
| // comment in endpoint, where it is used. |
| // |
| // +stateify savable |
| type keepalive struct { |
| sync.Mutex `state:"nosave"` |
| idle time.Duration |
| interval time.Duration |
| count int |
| unacked int |
| timer timer `state:"nosave"` |
| waker sleep.Waker `state:"nosave"` |
| } |
| |
| func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { |
| e := &endpoint{ |
| stack: s, |
| EndpointInfo: EndpointInfo{ |
| TransportEndpointInfo: stack.TransportEndpointInfo{ |
| NetProto: netProto, |
| TransProto: header.TCPProtocolNumber, |
| }, |
| }, |
| waiterQueue: waiterQueue, |
| state: StateInitial, |
| rcvBufSize: DefaultReceiveBufferSize, |
| sndMTU: math.MaxInt32, |
| keepalive: keepalive{ |
| // Linux defaults. |
| idle: 2 * time.Hour, |
| interval: 75 * time.Second, |
| count: 9, |
| }, |
| uniqueID: s.UniqueID(), |
| txHash: s.Rand().Uint32(), |
| windowClamp: DefaultReceiveBufferSize, |
| maxSynRetries: DefaultSynRetries, |
| } |
| e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits) |
| e.ops.SetMulticastLoop(true) |
| e.ops.SetQuickAck(true) |
| e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */) |
| |
| var ss tcpip.TCPSendBufferSizeRangeOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { |
| e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */) |
| } |
| |
| var rs tcpip.TCPReceiveBufferSizeRangeOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { |
| e.rcvBufSize = rs.Default |
| } |
| |
| var cs tcpip.CongestionControlOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { |
| e.cc = cs |
| } |
| |
| var mrb tcpip.TCPModerateReceiveBufferOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { |
| e.rcvAutoParams.disabled = !bool(mrb) |
| } |
| |
| var de tcpip.TCPDelayEnabled |
| if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de { |
| e.ops.SetDelayOption(true) |
| } |
| |
| var tcpLT tcpip.TCPLingerTimeoutOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil { |
| e.tcpLingerTimeout = time.Duration(tcpLT) |
| } |
| |
| var synRetries tcpip.TCPSynRetriesOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil { |
| e.maxSynRetries = uint8(synRetries) |
| } |
| |
| s.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) |
| |
| if p := s.GetTCPProbe(); p != nil { |
| e.probe = p |
| } |
| |
| e.segmentQueue.ep = e |
| e.tsOffset = timeStampOffset() |
| e.acceptCond = sync.NewCond(&e.acceptMu) |
| e.keepalive.timer.init(&e.keepalive.waker) |
| |
| return e |
| } |
| |
| // Readiness returns the current readiness of the endpoint. For example, if |
| // waiter.EventIn is set, the endpoint is immediately readable. |
| func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { |
| result := waiter.EventMask(0) |
| |
| switch e.EndpointState() { |
| case StateInitial, StateBound: |
| // This prevents blocking of new sockets which are not |
| // connected when SO_LINGER is set. |
| result |= waiter.EventHUp |
| |
| case StateConnecting, StateSynSent, StateSynRecv: |
| // Ready for nothing. |
| |
| case StateClose, StateError, StateTimeWait: |
| // Ready for anything. |
| result = mask |
| |
| case StateListen: |
| // Check if there's anything in the accepted channel. |
| if (mask & waiter.ReadableEvents) != 0 { |
| e.acceptMu.Lock() |
| if e.accepted.endpoints.Len() != 0 { |
| result |= waiter.ReadableEvents |
| } |
| e.acceptMu.Unlock() |
| } |
| } |
| if e.EndpointState().connected() { |
| // Determine if the endpoint is writable if requested. |
| if (mask & waiter.WritableEvents) != 0 { |
| e.sndBufMu.Lock() |
| sndBufSize := e.getSendBufferSize() |
| if e.sndClosed || e.sndBufUsed < sndBufSize { |
| result |= waiter.WritableEvents |
| } |
| e.sndBufMu.Unlock() |
| } |
| |
| // Determine if the endpoint is readable if requested. |
| if (mask & waiter.ReadableEvents) != 0 { |
| e.rcvListMu.Lock() |
| if e.rcvBufUsed > 0 || e.rcvClosed { |
| result |= waiter.ReadableEvents |
| } |
| e.rcvListMu.Unlock() |
| } |
| } |
| |
| return result |
| } |
| |
| func (e *endpoint) fetchNotifications() uint32 { |
| return atomic.SwapUint32(&e.notifyFlags, 0) |
| } |
| |
| func (e *endpoint) notifyProtocolGoroutine(n uint32) { |
| for { |
| v := atomic.LoadUint32(&e.notifyFlags) |
| if v&n == n { |
| // The flags are already set. |
| return |
| } |
| |
| if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) { |
| if v == 0 { |
| // We are causing a transition from no flags to |
| // at least one flag set, so we must cause the |
| // protocol goroutine to wake up. |
| e.notificationWaker.Assert() |
| } |
| return |
| } |
| } |
| } |
| |
| // Abort implements stack.TransportEndpoint.Abort. |
| func (e *endpoint) Abort() { |
| // The abort notification is not processed synchronously, so no |
| // synchronization is needed. |
| // |
| // If the endpoint becomes connected after this check, we still close |
| // the endpoint. This worst case results in a slower abort. |
| // |
| // If the endpoint disconnected after the check, nothing needs to be |
| // done, so sending a notification which will potentially be ignored is |
| // fine. |
| // |
| // If the endpoint connecting finishes after the check, the endpoint |
| // is either in a connected state (where we would notifyAbort anyway), |
| // SYN-RECV (where we would also notifyAbort anyway), or in an error |
| // state where nothing is required and the notification can be safely |
| // ignored. |
| // |
| // Endpoints where a Close during connecting or SYN-RECV state would be |
| // problematic are set to state connecting before being registered (and |
| // thus possible to be Aborted). They are never available in initial |
| // state. |
| // |
| // Endpoints transitioning from initial to connecting state may be |
| // safely either closed or sent notifyAbort. |
| if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() { |
| e.notifyProtocolGoroutine(notifyAbort) |
| return |
| } |
| e.Close() |
| } |
| |
| // Close puts the endpoint in a closed state and frees all resources associated |
| // with it. It must be called only once and with no other concurrent calls to |
| // the endpoint. |
| func (e *endpoint) Close() { |
| e.LockUser() |
| defer e.UnlockUser() |
| if e.closed { |
| return |
| } |
| |
| linger := e.SocketOptions().GetLinger() |
| if linger.Enabled && linger.Timeout == 0 { |
| s := e.EndpointState() |
| isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv |
| if isResetState { |
| // Close the endpoint without doing full shutdown and |
| // send a RST. |
| e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) |
| e.closeNoShutdownLocked() |
| |
| // Wake up worker to close the endpoint. |
| switch s { |
| case StateSynRecv: |
| e.notifyProtocolGoroutine(notifyClose) |
| default: |
| e.notifyProtocolGoroutine(notifyTickleWorker) |
| } |
| return |
| } |
| } |
| |
| // Issue a shutdown so that the peer knows we won't send any more data |
| // if we're connected, or stop accepting if we're listening. |
| e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead) |
| e.closeNoShutdownLocked() |
| } |
| |
| // closeNoShutdown closes the endpoint without doing a full shutdown. |
| func (e *endpoint) closeNoShutdownLocked() { |
| // For listening sockets, we always release ports inline so that they |
| // are immediately available for reuse after Close() is called. If also |
| // registered, we unregister as well otherwise the next user would fail |
| // in Listen() when trying to register. |
| if e.EndpointState() == StateListen && e.isPortReserved { |
| if e.isRegistered { |
| e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) |
| e.isRegistered = false |
| } |
| |
| portRes := ports.Reservation{ |
| Networks: e.effectiveNetProtos, |
| Transport: ProtocolNumber, |
| Addr: e.ID.LocalAddress, |
| Port: e.ID.LocalPort, |
| Flags: e.boundPortFlags, |
| BindToDevice: e.boundBindToDevice, |
| Dest: e.boundDest, |
| } |
| e.stack.ReleasePort(portRes) |
| e.isPortReserved = false |
| e.boundBindToDevice = 0 |
| e.boundPortFlags = ports.Flags{} |
| e.boundDest = tcpip.FullAddress{} |
| } |
| |
| // Mark endpoint as closed. |
| e.closed = true |
| |
| switch e.EndpointState() { |
| case StateClose, StateError: |
| return |
| } |
| |
| eventMask := waiter.ReadableEvents | waiter.WritableEvents |
| // Either perform the local cleanup or kick the worker to make sure it |
| // knows it needs to cleanup. |
| if e.workerRunning { |
| e.workerCleanup = true |
| tcpip.AddDanglingEndpoint(e) |
| // Worker will remove the dangling endpoint when the endpoint |
| // goroutine terminates. |
| e.notifyProtocolGoroutine(notifyClose) |
| } else { |
| e.transitionToStateCloseLocked() |
| // Notify that the endpoint is closed. |
| eventMask |= waiter.EventHUp |
| } |
| |
| // The TCP closing state-machine would eventually notify EventHUp, but we |
| // notify EventIn|EventOut immediately to unblock any blocked waiters. |
| e.waiterQueue.Notify(eventMask) |
| } |
| |
| // closePendingAcceptableConnections closes all connections that have completed |
| // handshake but not yet been delivered to the application. |
| func (e *endpoint) closePendingAcceptableConnectionsLocked() { |
| e.acceptMu.Lock() |
| acceptedCopy := e.accepted |
| e.accepted = accepted{} |
| e.acceptMu.Unlock() |
| |
| if acceptedCopy == (accepted{}) { |
| return |
| } |
| |
| e.acceptCond.Broadcast() |
| |
| // Reset all connections that are waiting to be accepted. |
| for n := acceptedCopy.endpoints.Front(); n != nil; n = n.Next() { |
| n.Value.(*endpoint).notifyProtocolGoroutine(notifyReset) |
| } |
| // Wait for reset of all endpoints that are still waiting to be delivered to |
| // the now closed accepted. |
| e.pendingAccepted.Wait() |
| } |
| |
| // cleanupLocked frees all resources associated with the endpoint. It is called |
| // after Close() is called and the worker goroutine (if any) is done with its |
| // work. |
| func (e *endpoint) cleanupLocked() { |
| // Close all endpoints that might have been accepted by TCP but not by |
| // the client. |
| e.closePendingAcceptableConnectionsLocked() |
| e.keepalive.timer.cleanup() |
| |
| e.workerCleanup = false |
| |
| if e.isRegistered { |
| e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) |
| e.isRegistered = false |
| } |
| |
| if e.isPortReserved { |
| portRes := ports.Reservation{ |
| Networks: e.effectiveNetProtos, |
| Transport: ProtocolNumber, |
| Addr: e.ID.LocalAddress, |
| Port: e.ID.LocalPort, |
| Flags: e.boundPortFlags, |
| BindToDevice: e.boundBindToDevice, |
| Dest: e.boundDest, |
| } |
| e.stack.ReleasePort(portRes) |
| e.isPortReserved = false |
| } |
| e.boundBindToDevice = 0 |
| e.boundPortFlags = ports.Flags{} |
| e.boundDest = tcpip.FullAddress{} |
| |
| if e.route != nil { |
| e.route.Release() |
| e.route = nil |
| } |
| |
| e.stack.CompleteTransportEndpointCleanup(e) |
| tcpip.DeleteDanglingEndpoint(e) |
| } |
| |
| // wndFromSpace returns the window that we can advertise based on the available |
| // receive buffer space. |
| func wndFromSpace(space int) int { |
| return space >> rcvAdvWndScale |
| } |
| |
| // initialReceiveWindow returns the initial receive window to advertise in the |
| // SYN/SYN-ACK. |
| func (e *endpoint) initialReceiveWindow() int { |
| rcvWnd := wndFromSpace(e.receiveBufferAvailable()) |
| if rcvWnd > math.MaxUint16 { |
| rcvWnd = math.MaxUint16 |
| } |
| |
| // Use the user supplied MSS, if available. |
| routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2 |
| if rcvWnd > routeWnd { |
| rcvWnd = routeWnd |
| } |
| rcvWndScale := e.rcvWndScaleForHandshake() |
| |
| // Round-down the rcvWnd to a multiple of wndScale. This ensures that the |
| // window offered in SYN won't be reduced due to the loss of precision if |
| // window scaling is enabled after the handshake. |
| rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale) |
| |
| // Ensure we can always accept at least 1 byte if the scale specified |
| // was too high for the provided rcvWnd. |
| if rcvWnd == 0 { |
| rcvWnd = 1 |
| } |
| |
| return rcvWnd |
| } |
| |
| // ModerateRecvBuf adjusts the receive buffer and the advertised window |
| // based on the number of bytes copied to userspace. |
| func (e *endpoint) ModerateRecvBuf(copied int) { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| e.rcvListMu.Lock() |
| if e.rcvAutoParams.disabled { |
| e.rcvListMu.Unlock() |
| return |
| } |
| now := time.Now() |
| if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt { |
| e.rcvAutoParams.copied += copied |
| e.rcvListMu.Unlock() |
| return |
| } |
| prevRTTCopied := e.rcvAutoParams.copied + copied |
| prevCopied := e.rcvAutoParams.prevCopied |
| rcvWnd := 0 |
| if prevRTTCopied > prevCopied { |
| // The minimal receive window based on what was copied by the app |
| // in the immediate preceding RTT and some extra buffer for 16 |
| // segments to account for variations. |
| // We multiply by 2 to account for packet losses. |
| rcvWnd = prevRTTCopied*2 + 16*int(e.amss) |
| |
| // Scale for slow start based on bytes copied in this RTT vs previous. |
| grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied |
| |
| // Multiply growth factor by 2 again to account for sender being |
| // in slow-start where the sender grows it's congestion window |
| // by 100% per RTT. |
| rcvWnd += grow * 2 |
| |
| // Make sure auto tuned buffer size can always receive upto 2x |
| // the initial window of 10 segments. |
| if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd { |
| rcvWnd = minRcvWnd |
| } |
| |
| // Cap the auto tuned buffer size by the maximum permissible |
| // receive buffer size. |
| if max := e.maxReceiveBufferSize(); rcvWnd > max { |
| rcvWnd = max |
| } |
| |
| // We do not adjust downwards as that can cause the receiver to |
| // reject valid data that might already be in flight as the |
| // acceptable window will shrink. |
| if rcvWnd > e.rcvBufSize { |
| availBefore := wndFromSpace(e.receiveBufferAvailableLocked()) |
| e.rcvBufSize = rcvWnd |
| availAfter := wndFromSpace(e.receiveBufferAvailableLocked()) |
| if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above { |
| e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) |
| } |
| } |
| |
| // We only update prevCopied when we grow the buffer because in cases |
| // where prevCopied > prevRTTCopied the existing buffer is already big |
| // enough to handle the current rate and we don't need to do any |
| // adjustments. |
| e.rcvAutoParams.prevCopied = prevRTTCopied |
| } |
| e.rcvAutoParams.measureTime = now |
| e.rcvAutoParams.copied = 0 |
| e.rcvListMu.Unlock() |
| } |
| |
| // SetOwner implements tcpip.Endpoint.SetOwner. |
| func (e *endpoint) SetOwner(owner tcpip.PacketOwner) { |
| e.owner = owner |
| } |
| |
| // Preconditions: e.mu must be held to call this function. |
| func (e *endpoint) hardErrorLocked() tcpip.Error { |
| err := e.hardError |
| e.hardError = nil |
| return err |
| } |
| |
| // Preconditions: e.mu must be held to call this function. |
| func (e *endpoint) lastErrorLocked() tcpip.Error { |
| e.lastErrorMu.Lock() |
| defer e.lastErrorMu.Unlock() |
| err := e.lastError |
| e.lastError = nil |
| return err |
| } |
| |
| // LastError implements tcpip.Endpoint.LastError. |
| func (e *endpoint) LastError() tcpip.Error { |
| e.LockUser() |
| defer e.UnlockUser() |
| if err := e.hardErrorLocked(); err != nil { |
| return err |
| } |
| return e.lastErrorLocked() |
| } |
| |
| // UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError. |
| func (e *endpoint) UpdateLastError(err tcpip.Error) { |
| e.LockUser() |
| e.lastErrorMu.Lock() |
| e.lastError = err |
| e.lastErrorMu.Unlock() |
| e.UnlockUser() |
| } |
| |
| // Read implements tcpip.Endpoint.Read. |
| func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) { |
| e.rcvReadMu.Lock() |
| defer e.rcvReadMu.Unlock() |
| |
| // N.B. Here we get a range of segments to be processed. It is safe to not |
| // hold rcvListMu when processing, since we hold rcvReadMu to ensure only we |
| // can remove segments from the list through commitRead(). |
| first, last, serr := e.startRead() |
| if serr != nil { |
| if _, ok := serr.(*tcpip.ErrClosedForReceive); ok { |
| e.stats.ReadErrors.ReadClosed.Increment() |
| } |
| return tcpip.ReadResult{}, serr |
| } |
| |
| var err error |
| done := 0 |
| s := first |
| for s != nil { |
| var n int |
| n, err = s.data.ReadTo(dst, opts.Peek) |
| // Book keeping first then error handling. |
| |
| done += n |
| |
| if opts.Peek { |
| // For peek, we use the (first, last) range of segment returned from |
| // startRead. We don't consume the receive buffer, so commitRead should |
| // not be called. |
| // |
| // N.B. It is important to use `last` to determine the last segment, since |
| // appending can happen while we process, and will lead to data race. |
| if s == last { |
| break |
| } |
| s = s.Next() |
| } else { |
| // N.B. commitRead() conveniently returns the next segment to read, after |
| // removing the data/segment that is read. |
| s = e.commitRead(n) |
| } |
| |
| if err != nil { |
| break |
| } |
| } |
| |
| // If something is read, we must report it. Report error when nothing is read. |
| if done == 0 && err != nil { |
| return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{} |
| } |
| return tcpip.ReadResult{ |
| Count: done, |
| Total: done, |
| }, nil |
| } |
| |
| // startRead checks that endpoint is in a readable state, and return the |
| // inclusive range of segments that can be read. |
| // |
| // Precondition: e.rcvReadMu must be held. |
| func (e *endpoint) startRead() (first, last *segment, err tcpip.Error) { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| // When in SYN-SENT state, let the caller block on the receive. |
| // An application can initiate a non-blocking connect and then block |
| // on a receive. It can expect to read any data after the handshake |
| // is complete. RFC793, section 3.9, p58. |
| if e.EndpointState() == StateSynSent { |
| return nil, nil, &tcpip.ErrWouldBlock{} |
| } |
| |
| // The endpoint can be read if it's connected, or if it's already closed |
| // but has some pending unread data. Also note that a RST being received |
| // would cause the state to become StateError so we should allow the |
| // reads to proceed before returning a ECONNRESET. |
| e.rcvListMu.Lock() |
| defer e.rcvListMu.Unlock() |
| |
| bufUsed := e.rcvBufUsed |
| if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 { |
| if s == StateError { |
| if err := e.hardErrorLocked(); err != nil { |
| return nil, nil, err |
| } |
| return nil, nil, &tcpip.ErrClosedForReceive{} |
| } |
| e.stats.ReadErrors.NotConnected.Increment() |
| return nil, nil, &tcpip.ErrNotConnected{} |
| } |
| |
| if e.rcvBufUsed == 0 { |
| if e.rcvClosed || !e.EndpointState().connected() { |
| return nil, nil, &tcpip.ErrClosedForReceive{} |
| } |
| return nil, nil, &tcpip.ErrWouldBlock{} |
| } |
| |
| return e.rcvList.Front(), e.rcvList.Back(), nil |
| } |
| |
| // commitRead commits a read of done bytes and returns the next non-empty |
| // segment to read. Data read from the segment must have also been removed from |
| // the segment in order for this method to work correctly. |
| // |
| // It is performance critical to call commitRead frequently when servicing a big |
| // Read request, so TCP can make progress timely. Right now, it is designed to |
| // do this per segment read, hence this method conveniently returns the next |
| // segment to read while holding the lock. |
| // |
| // Precondition: e.rcvReadMu must be held. |
| func (e *endpoint) commitRead(done int) *segment { |
| e.LockUser() |
| defer e.UnlockUser() |
| e.rcvListMu.Lock() |
| defer e.rcvListMu.Unlock() |
| |
| memDelta := 0 |
| s := e.rcvList.Front() |
| for s != nil && s.data.Size() == 0 { |
| e.rcvList.Remove(s) |
| // Memory is only considered released when the whole segment has been |
| // read. |
| memDelta += s.segMemSize() |
| s.decRef() |
| s = e.rcvList.Front() |
| } |
| e.rcvBufUsed -= done |
| |
| if memDelta > 0 { |
| // If the window was small before this read and if the read freed up |
| // enough buffer space, to either fit an aMSS or half a receive buffer |
| // (whichever smaller), then notify the protocol goroutine to send a |
| // window update. |
| if crossed, above := e.windowCrossedACKThresholdLocked(memDelta); crossed && above { |
| e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) |
| } |
| } |
| |
| return e.rcvList.Front() |
| } |
| |
| // isEndpointWritableLocked checks if a given endpoint is writable |
| // and also returns the number of bytes that can be written at this |
| // moment. If the endpoint is not writable then it returns an error |
| // indicating the reason why it's not writable. |
| // Caller must hold e.mu and e.sndBufMu |
| func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) { |
| // The endpoint cannot be written to if it's not connected. |
| switch s := e.EndpointState(); { |
| case s == StateError: |
| if err := e.hardErrorLocked(); err != nil { |
| return 0, err |
| } |
| return 0, &tcpip.ErrClosedForSend{} |
| case !s.connecting() && !s.connected(): |
| return 0, &tcpip.ErrClosedForSend{} |
| case s.connecting(): |
| // As per RFC793, page 56, a send request arriving when in connecting |
| // state, can be queued to be completed after the state becomes |
| // connected. Return an error code for the caller of endpoint Write to |
| // try again, until the connection handshake is complete. |
| return 0, &tcpip.ErrWouldBlock{} |
| } |
| |
| // Check if the connection has already been closed for sends. |
| if e.sndClosed { |
| return 0, &tcpip.ErrClosedForSend{} |
| } |
| |
| sndBufSize := e.getSendBufferSize() |
| avail := sndBufSize - e.sndBufUsed |
| if avail <= 0 { |
| return 0, &tcpip.ErrWouldBlock{} |
| } |
| return avail, nil |
| } |
| |
| // Write writes data to the endpoint's peer. |
| func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) { |
| // Linux completely ignores any address passed to sendto(2) for TCP sockets |
| // (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More |
| // and opts.EndOfRecord are also ignored. |
| |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| nextSeg, n, err := func() (*segment, int, tcpip.Error) { |
| e.sndBufMu.Lock() |
| defer e.sndBufMu.Unlock() |
| |
| avail, err := e.isEndpointWritableLocked() |
| if err != nil { |
| e.stats.WriteErrors.WriteClosed.Increment() |
| return nil, 0, err |
| } |
| |
| v, err := func() ([]byte, tcpip.Error) { |
| // We can release locks while copying data. |
| // |
| // This is not possible if atomic is set, because we can't allow the |
| // available buffer space to be consumed by some other caller while we |
| // are copying data in. |
| if !opts.Atomic { |
| e.sndBufMu.Unlock() |
| defer e.sndBufMu.Lock() |
| |
| e.UnlockUser() |
| defer e.LockUser() |
| } |
| |
| // Fetch data. |
| if l := p.Len(); l < avail { |
| avail = l |
| } |
| if avail == 0 { |
| return nil, nil |
| } |
| v := make([]byte, avail) |
| n, err := p.Read(v) |
| if err != nil && err != io.EOF { |
| return nil, &tcpip.ErrBadBuffer{} |
| } |
| return v[:n], nil |
| }() |
| if len(v) == 0 || err != nil { |
| return nil, 0, err |
| } |
| |
| if !opts.Atomic { |
| // Since we released locks in between it's possible that the |
| // endpoint transitioned to a CLOSED/ERROR states so make |
| // sure endpoint is still writable before trying to write. |
| avail, err := e.isEndpointWritableLocked() |
| if err != nil { |
| e.stats.WriteErrors.WriteClosed.Increment() |
| return nil, 0, err |
| } |
| |
| // Discard any excess data copied in due to avail being reduced due |
| // to a simultaneous write call to the socket. |
| if avail < len(v) { |
| v = v[:avail] |
| } |
| } |
| |
| // Add data to the send queue. |
| s := newOutgoingSegment(e.ID, v) |
| e.sndBufUsed += len(v) |
| e.sndBufInQueue += seqnum.Size(len(v)) |
| e.sndQueue.PushBack(s) |
| |
| return e.drainSendQueueLocked(), len(v), nil |
| }() |
| // Return if either we didn't queue anything or if an error occurred while |
| // attempting to queue data. |
| if n == 0 || err != nil { |
| return 0, err |
| } |
| e.sendData(nextSeg) |
| return int64(n), nil |
| } |
| |
| // selectWindowLocked returns the new window without checking for shrinking or scaling |
| // applied. |
| // Precondition: e.mu and e.rcvListMu must be held. |
| func (e *endpoint) selectWindowLocked() (wnd seqnum.Size) { |
| wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked()) |
| maxWindow := wndFromSpace(e.rcvBufSize) |
| wndFromUsedBytes := maxWindow - e.rcvBufUsed |
| |
| // We take the lesser of the wndFromAvailable and wndFromUsedBytes because in |
| // cases where we receive a lot of small segments the segment overhead is a |
| // lot higher and we can run out socket buffer space before we can fill the |
| // previous window we advertised. In cases where we receive MSS sized or close |
| // MSS sized segments we will probably run out of window space before we |
| // exhaust receive buffer. |
| newWnd := wndFromAvailable |
| if newWnd > wndFromUsedBytes { |
| newWnd = wndFromUsedBytes |
| } |
| if newWnd < 0 { |
| newWnd = 0 |
| } |
| return seqnum.Size(newWnd) |
| } |
| |
| // selectWindow invokes selectWindowLocked after acquiring e.rcvListMu. |
| func (e *endpoint) selectWindow() (wnd seqnum.Size) { |
| e.rcvListMu.Lock() |
| wnd = e.selectWindowLocked() |
| e.rcvListMu.Unlock() |
| return wnd |
| } |
| |
| // windowCrossedACKThresholdLocked checks if the receive window to be announced |
| // would be under aMSS or under the window derived from half receive buffer, |
| // whichever smaller. This is useful as a receive side silly window syndrome |
| // prevention mechanism. If window grows to reasonable value, we should send ACK |
| // to the sender to inform the rx space is now large. We also want ensure a |
| // series of small read()'s won't trigger a flood of spurious tiny ACK's. |
| // |
| // For large receive buffers, the threshold is aMSS - once reader reads more |
| // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of |
| // receive buffer size. This is chosen arbitrairly. |
| // crossed will be true if the window size crossed the ACK threshold. |
| // above will be true if the new window is >= ACK threshold and false |
| // otherwise. |
| // |
| // Precondition: e.mu and e.rcvListMu must be held. |
| func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int) (crossed bool, above bool) { |
| newAvail := int(e.selectWindowLocked()) |
| oldAvail := newAvail - deltaBefore |
| if oldAvail < 0 { |
| oldAvail = 0 |
| } |
| threshold := int(e.amss) |
| // rcvBufFraction is the inverse of the fraction of receive buffer size that |
| // is used to decide if the available buffer space is now above it. |
| const rcvBufFraction = 2 |
| if wndThreshold := wndFromSpace(e.rcvBufSize / rcvBufFraction); threshold > wndThreshold { |
| threshold = wndThreshold |
| } |
| switch { |
| case oldAvail < threshold && newAvail >= threshold: |
| return true, true |
| case oldAvail >= threshold && newAvail < threshold: |
| return true, false |
| } |
| return false, false |
| } |
| |
| // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. |
| func (e *endpoint) OnReuseAddressSet(v bool) { |
| e.LockUser() |
| e.portFlags.TupleOnly = v |
| e.UnlockUser() |
| } |
| |
| // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. |
| func (e *endpoint) OnReusePortSet(v bool) { |
| e.LockUser() |
| e.portFlags.LoadBalanced = v |
| e.UnlockUser() |
| } |
| |
| // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet. |
| func (e *endpoint) OnKeepAliveSet(bool) { |
| e.notifyProtocolGoroutine(notifyKeepaliveChanged) |
| } |
| |
| // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. |
| func (e *endpoint) OnDelayOptionSet(v bool) { |
| if !v { |
| // Handle delayed data. |
| e.sndWaker.Assert() |
| } |
| } |
| |
| // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. |
| func (e *endpoint) OnCorkOptionSet(v bool) { |
| if !v { |
| // Handle the corked data. |
| e.sndWaker.Assert() |
| } |
| } |
| |
| func (e *endpoint) getSendBufferSize() int { |
| return int(e.ops.GetSendBufferSize()) |
| } |
| |
| // SetSockOptInt sets a socket option. |
| func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { |
| // Lower 2 bits represents ECN bits. RFC 3168, section 23.1 |
| const inetECNMask = 3 |
| |
| switch opt { |
| case tcpip.KeepaliveCountOption: |
| e.keepalive.Lock() |
| e.keepalive.count = v |
| e.keepalive.Unlock() |
| e.notifyProtocolGoroutine(notifyKeepaliveChanged) |
| |
| case tcpip.IPv4TOSOption: |
| e.LockUser() |
| // TODO(gvisor.dev/issue/995): ECN is not currently supported, |
| // ignore the bits for now. |
| e.sendTOS = uint8(v) & ^uint8(inetECNMask) |
| e.UnlockUser() |
| |
| case tcpip.IPv6TrafficClassOption: |
| e.LockUser() |
| // TODO(gvisor.dev/issue/995): ECN is not currently supported, |
| // ignore the bits for now. |
| e.sendTOS = uint8(v) & ^uint8(inetECNMask) |
| e.UnlockUser() |
| |
| case tcpip.MaxSegOption: |
| userMSS := v |
| if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS { |
| return &tcpip.ErrInvalidOptionValue{} |
| } |
| e.LockUser() |
| e.userMSS = uint16(userMSS) |
| e.UnlockUser() |
| e.notifyProtocolGoroutine(notifyMSSChanged) |
| |
| case tcpip.MTUDiscoverOption: |
| // Return not supported if attempting to set this option to |
| // anything other than path MTU discovery disabled. |
| if v != tcpip.PMTUDiscoveryDont { |
| return &tcpip.ErrNotSupported{} |
| } |
| |
| case tcpip.ReceiveBufferSizeOption: |
| // Make sure the receive buffer size is within the min and max |
| // allowed. |
| var rs tcpip.TCPReceiveBufferSizeRangeOption |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { |
| panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &rs, err)) |
| } |
| |
| if v > rs.Max { |
| v = rs.Max |
| } |
| |
| if v < math.MaxInt32/SegOverheadFactor { |
| v *= SegOverheadFactor |
| if v < rs.Min { |
| v = rs.Min |
| } |
| } else { |
| v = math.MaxInt32 |
| } |
| |
| e.LockUser() |
| e.rcvListMu.Lock() |
| |
| // Make sure the receive buffer size allows us to send a |
| // non-zero window size. |
| scale := uint8(0) |
| if e.rcv != nil { |
| scale = e.rcv.rcvWndScale |
| } |
| if v>>scale == 0 { |
| v = 1 << scale |
| } |
| |
| availBefore := wndFromSpace(e.receiveBufferAvailableLocked()) |
| e.rcvBufSize = v |
| availAfter := wndFromSpace(e.receiveBufferAvailableLocked()) |
| |
| e.rcvAutoParams.disabled = true |
| |
| // Immediately send an ACK to uncork the sender silly window |
| // syndrome prevetion, when our available space grows above aMSS |
| // or half receive buffer, whichever smaller. |
| if crossed, above := e.windowCrossedACKThresholdLocked(availAfter - availBefore); crossed && above { |
| e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) |
| } |
| |
| e.rcvListMu.Unlock() |
| e.UnlockUser() |
| |
| case tcpip.TTLOption: |
| e.LockUser() |
| e.ttl = uint8(v) |
| e.UnlockUser() |
| |
| case tcpip.TCPSynCountOption: |
| if v < 1 || v > 255 { |
| return &tcpip.ErrInvalidOptionValue{} |
| } |
| e.LockUser() |
| e.maxSynRetries = uint8(v) |
| e.UnlockUser() |
| |
| case tcpip.TCPWindowClampOption: |
| if v == 0 { |
| e.LockUser() |
| switch e.EndpointState() { |
| case StateClose, StateInitial: |
| e.windowClamp = 0 |
| e.UnlockUser() |
| return nil |
| default: |
| e.UnlockUser() |
| return &tcpip.ErrInvalidOptionValue{} |
| } |
| } |
| var rs tcpip.TCPReceiveBufferSizeRangeOption |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil { |
| if v < rs.Min/2 { |
| v = rs.Min / 2 |
| } |
| } |
| e.LockUser() |
| e.windowClamp = uint32(v) |
| e.UnlockUser() |
| } |
| return nil |
| } |
| |
| func (e *endpoint) HasNIC(id int32) bool { |
| return id == 0 || e.stack.HasNIC(tcpip.NICID(id)) |
| } |
| |
| // SetSockOpt sets a socket option. |
| func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { |
| switch v := opt.(type) { |
| case *tcpip.KeepaliveIdleOption: |
| e.keepalive.Lock() |
| e.keepalive.idle = time.Duration(*v) |
| e.keepalive.Unlock() |
| e.notifyProtocolGoroutine(notifyKeepaliveChanged) |
| |
| case *tcpip.KeepaliveIntervalOption: |
| e.keepalive.Lock() |
| e.keepalive.interval = time.Duration(*v) |
| e.keepalive.Unlock() |
| e.notifyProtocolGoroutine(notifyKeepaliveChanged) |
| |
| case *tcpip.TCPUserTimeoutOption: |
| e.LockUser() |
| e.userTimeout = time.Duration(*v) |
| e.UnlockUser() |
| |
| case *tcpip.CongestionControlOption: |
| // Query the available cc algorithms in the stack and |
| // validate that the specified algorithm is actually |
| // supported in the stack. |
| var avail tcpip.TCPAvailableCongestionControlOption |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil { |
| return err |
| } |
| availCC := strings.Split(string(avail), " ") |
| for _, cc := range availCC { |
| if *v == tcpip.CongestionControlOption(cc) { |
| e.LockUser() |
| state := e.EndpointState() |
| e.cc = *v |
| switch state { |
| case StateEstablished: |
| if e.EndpointState() == state { |
| e.snd.cc = e.snd.initCongestionControl(e.cc) |
| } |
| } |
| e.UnlockUser() |
| return nil |
| } |
| } |
| |
| // Linux returns ENOENT when an invalid congestion |
| // control algorithm is specified. |
| return &tcpip.ErrNoSuchFile{} |
| |
| case *tcpip.TCPLingerTimeoutOption: |
| e.LockUser() |
| |
| switch { |
| case *v < 0: |
| // Same as effectively disabling TCPLinger timeout. |
| *v = -1 |
| case *v == 0: |
| // Same as the stack default. |
| var stackLingerTimeout tcpip.TCPLingerTimeoutOption |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil { |
| panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err)) |
| } |
| *v = stackLingerTimeout |
| case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout): |
| // Cap it to Stack's default TCP_LINGER2 timeout. |
| *v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout) |
| default: |
| } |
| |
| e.tcpLingerTimeout = time.Duration(*v) |
| e.UnlockUser() |
| |
| case *tcpip.TCPDeferAcceptOption: |
| e.LockUser() |
| if time.Duration(*v) > MaxRTO { |
| *v = tcpip.TCPDeferAcceptOption(MaxRTO) |
| } |
| e.deferAccept = time.Duration(*v) |
| e.UnlockUser() |
| |
| case *tcpip.SocketDetachFilterOption: |
| return nil |
| |
| default: |
| return nil |
| } |
| return nil |
| } |
| |
| // readyReceiveSize returns the number of bytes ready to be received. |
| func (e *endpoint) readyReceiveSize() (int, tcpip.Error) { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| // The endpoint cannot be in listen state. |
| if e.EndpointState() == StateListen { |
| return 0, &tcpip.ErrInvalidEndpointState{} |
| } |
| |
| e.rcvListMu.Lock() |
| defer e.rcvListMu.Unlock() |
| |
| return e.rcvBufUsed, nil |
| } |
| |
| // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. |
| func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { |
| switch opt { |
| case tcpip.KeepaliveCountOption: |
| e.keepalive.Lock() |
| v := e.keepalive.count |
| e.keepalive.Unlock() |
| return v, nil |
| |
| case tcpip.IPv4TOSOption: |
| e.LockUser() |
| v := int(e.sendTOS) |
| e.UnlockUser() |
| return v, nil |
| |
| case tcpip.IPv6TrafficClassOption: |
| e.LockUser() |
| v := int(e.sendTOS) |
| e.UnlockUser() |
| return v, nil |
| |
| case tcpip.MaxSegOption: |
| // This is just stubbed out. Linux never returns the user_mss |
| // value as it either returns the defaultMSS or returns the |
| // actual current MSS. Netstack just returns the defaultMSS |
| // always for now. |
| v := header.TCPDefaultMSS |
| return v, nil |
| |
| case tcpip.MTUDiscoverOption: |
| // Always return the path MTU discovery disabled setting since |
| // it's the only one supported. |
| return tcpip.PMTUDiscoveryDont, nil |
| |
| case tcpip.ReceiveQueueSizeOption: |
| return e.readyReceiveSize() |
| |
| case tcpip.ReceiveBufferSizeOption: |
| e.rcvListMu.Lock() |
| v := e.rcvBufSize |
| e.rcvListMu.Unlock() |
| return v, nil |
| |
| case tcpip.TTLOption: |
| e.LockUser() |
| v := int(e.ttl) |
| e.UnlockUser() |
| return v, nil |
| |
| case tcpip.TCPSynCountOption: |
| e.LockUser() |
| v := int(e.maxSynRetries) |
| e.UnlockUser() |
| return v, nil |
| |
| case tcpip.TCPWindowClampOption: |
| e.LockUser() |
| v := int(e.windowClamp) |
| e.UnlockUser() |
| return v, nil |
| |
| case tcpip.MulticastTTLOption: |
| return 1, nil |
| |
| default: |
| return -1, &tcpip.ErrUnknownProtocolOption{} |
| } |
| } |
| |
| func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption { |
| info := tcpip.TCPInfoOption{} |
| e.LockUser() |
| snd := e.snd |
| if snd != nil { |
| // We do not calculate RTT before sending the data packets. If |
| // the connection did not send and receive data, then RTT will |
| // be zero. |
| snd.rtt.Lock() |
| info.RTT = snd.rtt.srtt |
| info.RTTVar = snd.rtt.rttvar |
| snd.rtt.Unlock() |
| |
| info.RTO = snd.rto |
| info.CcState = snd.state |
| info.SndSsthresh = uint32(snd.sndSsthresh) |
| info.SndCwnd = uint32(snd.sndCwnd) |
| info.ReorderSeen = snd.rc.reorderSeen |
| } |
| e.UnlockUser() |
| return info |
| } |
| |
| // GetSockOpt implements tcpip.Endpoint.GetSockOpt. |
| func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { |
| switch o := opt.(type) { |
| case *tcpip.TCPInfoOption: |
| *o = e.getTCPInfo() |
| |
| case *tcpip.KeepaliveIdleOption: |
| e.keepalive.Lock() |
| *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) |
| e.keepalive.Unlock() |
| |
| case *tcpip.KeepaliveIntervalOption: |
| e.keepalive.Lock() |
| *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) |
| e.keepalive.Unlock() |
| |
| case *tcpip.TCPUserTimeoutOption: |
| e.LockUser() |
| *o = tcpip.TCPUserTimeoutOption(e.userTimeout) |
| e.UnlockUser() |
| |
| case *tcpip.CongestionControlOption: |
| e.LockUser() |
| *o = e.cc |
| e.UnlockUser() |
| |
| case *tcpip.TCPLingerTimeoutOption: |
| e.LockUser() |
| *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) |
| e.UnlockUser() |
| |
| case *tcpip.TCPDeferAcceptOption: |
| e.LockUser() |
| *o = tcpip.TCPDeferAcceptOption(e.deferAccept) |
| e.UnlockUser() |
| |
| case *tcpip.OriginalDestinationOption: |
| e.LockUser() |
| ipt := e.stack.IPTables() |
| addr, port, err := ipt.OriginalDst(e.ID, e.NetProto) |
| e.UnlockUser() |
| if err != nil { |
| return err |
| } |
| *o = tcpip.OriginalDestinationOption{ |
| Addr: addr, |
| Port: port, |
| } |
| |
| default: |
| return &tcpip.ErrUnknownProtocolOption{} |
| } |
| return nil |
| } |
| |
| // checkV4MappedLocked determines the effective network protocol and converts |
| // addr to its canonical form. |
| func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { |
| unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) |
| if err != nil { |
| return tcpip.FullAddress{}, 0, err |
| } |
| return unwrapped, netProto, nil |
| } |
| |
| // Disconnect implements tcpip.Endpoint.Disconnect. |
| func (*endpoint) Disconnect() tcpip.Error { |
| return &tcpip.ErrNotSupported{} |
| } |
| |
| // Connect connects the endpoint to its peer. |
| func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error { |
| err := e.connect(addr, true, true) |
| if err != nil { |
| if !err.IgnoreStats() { |
| // Connect failed. Let's wake up any waiters. |
| e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents) |
| e.stack.Stats().TCP.FailedConnectionAttempts.Increment() |
| e.stats.FailedConnectionAttempts.Increment() |
| } |
| } |
| return err |
| } |
| |
| // connect connects the endpoint to its peer. In the normal non-S/R case, the |
| // new connection is expected to run the main goroutine and perform handshake. |
| // In restore of previously connected endpoints, both ends will be passively |
| // created (so no new handshaking is done); for stack-accepted connections not |
| // yet accepted by the app, they are restored without running the main goroutine |
| // here. |
| func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcpip.Error { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| connectingAddr := addr.Addr |
| |
| addr, netProto, err := e.checkV4MappedLocked(addr) |
| if err != nil { |
| return err |
| } |
| |
| if e.EndpointState().connected() { |
| // The endpoint is already connected. If caller hasn't been |
| // notified yet, return success. |
| if !e.isConnectNotified { |
| e.isConnectNotified = true |
| return nil |
| } |
| // Otherwise return that it's already connected. |
| return &tcpip.ErrAlreadyConnected{} |
| } |
| |
| nicID := addr.NIC |
| switch e.EndpointState() { |
| case StateBound: |
| // If we're already bound to a NIC but the caller is requesting |
| // that we use a different one now, we cannot proceed. |
| if e.boundNICID == 0 { |
| break |
| } |
| |
| if nicID != 0 && nicID != e.boundNICID { |
| return &tcpip.ErrNoRoute{} |
| } |
| |
| nicID = e.boundNICID |
| |
| case StateInitial: |
| // Nothing to do. We'll eventually fill-in the gaps in the ID (if any) |
| // when we find a route. |
| |
| case StateConnecting, StateSynSent, StateSynRecv: |
| // A connection request has already been issued but hasn't completed |
| // yet. |
| return &tcpip.ErrAlreadyConnecting{} |
| |
| case StateError: |
| if err := e.hardErrorLocked(); err != nil { |
| return err |
| } |
| return &tcpip.ErrConnectionAborted{} |
| |
| default: |
| return &tcpip.ErrInvalidEndpointState{} |
| } |
| |
| // Find a route to the desired destination. |
| r, err := e.stack.FindRoute(nicID, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) |
| if err != nil { |
| return err |
| } |
| defer r.Release() |
| |
| netProtos := []tcpip.NetworkProtocolNumber{netProto} |
| e.ID.LocalAddress = r.LocalAddress() |
| e.ID.RemoteAddress = r.RemoteAddress() |
| e.ID.RemotePort = addr.Port |
| |
| if e.ID.LocalPort != 0 { |
| // The endpoint is bound to a port, attempt to register it. |
| err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice) |
| if err != nil { |
| return err |
| } |
| } else { |
| // The endpoint doesn't have a local port yet, so try to get |
| // one. Make sure that it isn't one that will result in the same |
| // address/port for both local and remote (otherwise this |
| // endpoint would be trying to connect to itself). |
| sameAddr := e.ID.LocalAddress == e.ID.RemoteAddress |
| |
| // Calculate a port offset based on the destination IP/port and |
| // src IP to ensure that for a given tuple (srcIP, destIP, |
| // destPort) the offset used as a starting point is the same to |
| // ensure that we can cycle through the port space effectively. |
| portBuf := make([]byte, 2) |
| binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort) |
| |
| h := jenkins.Sum32(e.stack.Seed()) |
| for _, s := range [][]byte{ |
| []byte(e.ID.LocalAddress), |
| []byte(e.ID.RemoteAddress), |
| portBuf, |
| } { |
| // Per io.Writer.Write: |
| // |
| // Write must return a non-nil error if it returns n < len(p). |
| if _, err := h.Write(s); err != nil { |
| panic(err) |
| } |
| } |
| portOffset := uint16(h.Sum32()) |
| |
| var twReuse tcpip.TCPTimeWaitReuseOption |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil { |
| panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err)) |
| } |
| |
| reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal |
| if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly { |
| switch netProto { |
| case header.IPv4ProtocolNumber: |
| reuse = header.IsV4LoopbackAddress(e.ID.LocalAddress) && header.IsV4LoopbackAddress(e.ID.RemoteAddress) |
| case header.IPv6ProtocolNumber: |
| reuse = e.ID.LocalAddress == header.IPv6Loopback && e.ID.RemoteAddress == header.IPv6Loopback |
| } |
| } |
| |
| bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) |
| if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) { |
| if sameAddr && p == e.ID.RemotePort { |
| return false, nil |
| } |
| portRes := ports.Reservation{ |
| Networks: netProtos, |
| Transport: ProtocolNumber, |
| Addr: e.ID.LocalAddress, |
| Port: p, |
| Flags: e.portFlags, |
| BindToDevice: bindToDevice, |
| Dest: addr, |
| } |
| if _, err := e.stack.ReservePort(portRes, nil /* testPort */); err != nil { |
| if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse { |
| return false, nil |
| } |
| transEPID := e.ID |
| transEPID.LocalPort = p |
| // Check if an endpoint is registered with demuxer in TIME-WAIT and if |
| // we can reuse it. If we can't find a transport endpoint then we just |
| // skip using this port as it's possible that either an endpoint has |
| // bound the port but not registered with demuxer yet (no listen/connect |
| // done yet) or the reservation was freed between the check above and |
| // the FindTransportEndpoint below. But rather than retry the same port |
| // we just skip it and move on. |
| transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, r.NICID()) |
| if transEP == nil { |
| // ReservePort failed but there is no registered endpoint with |
| // demuxer. Which indicates there is at least some endpoint that has |
| // bound the port. |
| return false, nil |
| } |
| |
| tcpEP := transEP.(*endpoint) |
| tcpEP.LockUser() |
| // If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but |
| // less than 1 second has elapsed since its recentTS was updated then |
| // we cannot reuse the port. |
| if tcpEP.EndpointState() != StateTimeWait || time.Since(tcpEP.recentTSTime) < 1*time.Second { |
| tcpEP.UnlockUser() |
| return false, nil |
| } |
| // Since the endpoint is in TIME-WAIT it should be safe to acquire its |
| // Lock while holding the lock for this endpoint as endpoints in |
| // TIME-WAIT do not acquire locks on other endpoints. |
| tcpEP.workerCleanup = false |
| tcpEP.cleanupLocked() |
| tcpEP.notifyProtocolGoroutine(notifyAbort) |
| tcpEP.UnlockUser() |
| // Now try and Reserve again if it fails then we skip. |
| portRes := ports.Reservation{ |
| Networks: netProtos, |
| Transport: ProtocolNumber, |
| Addr: e.ID.LocalAddress, |
| Port: p, |
| Flags: e.portFlags, |
| BindToDevice: bindToDevice, |
| Dest: addr, |
| } |
| if _, err := e.stack.ReservePort(portRes, nil /* testPort */); err != nil { |
| return false, nil |
| } |
| } |
| |
| id := e.ID |
| id.LocalPort = p |
| if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil { |
| portRes := ports.Reservation{ |
| Networks: netProtos, |
| Transport: ProtocolNumber, |
| Addr: e.ID.LocalAddress, |
| Port: p, |
| Flags: e.portFlags, |
| BindToDevice: bindToDevice, |
| Dest: addr, |
| } |
| e.stack.ReleasePort(portRes) |
| if _, ok := err.(*tcpip.ErrPortInUse); ok { |
| return false, nil |
| } |
| return false, err |
| } |
| |
| // Port picking successful. Save the details of |
| // the selected port. |
| e.ID = id |
| e.isPortReserved = true |
| e.boundBindToDevice = bindToDevice |
| e.boundPortFlags = e.portFlags |
| e.boundDest = addr |
| return true, nil |
| }); err != nil { |
| return err |
| } |
| } |
| |
| e.isRegistered = true |
| e.setEndpointState(StateConnecting) |
| r.Acquire() |
| e.route = r |
| e.boundNICID = nicID |
| e.effectiveNetProtos = netProtos |
| e.connectingAddress = connectingAddr |
| |
| e.initGSO() |
| |
| // Connect in the restore phase does not perform handshake. Restore its |
| // connection setting here. |
| if !handshake { |
| e.segmentQueue.mu.Lock() |
| for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} { |
| for s := l.Front(); s != nil; s = s.Next() { |
| s.id = e.ID |
| e.sndWaker.Assert() |
| } |
| } |
| e.segmentQueue.mu.Unlock() |
| e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) |
| e.setEndpointState(StateEstablished) |
| } |
| |
| if run { |
| if handshake { |
| h := e.newHandshake() |
| e.setEndpointState(StateSynSent) |
| h.start() |
| } |
| e.stack.Stats().TCP.ActiveConnectionOpenings.Increment() |
| e.workerRunning = true |
| go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save. |
| } |
| |
| return &tcpip.ErrConnectStarted{} |
| } |
| |
| // ConnectEndpoint is not supported. |
| func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error { |
| return &tcpip.ErrInvalidEndpointState{} |
| } |
| |
| // Shutdown closes the read and/or write end of the endpoint connection to its |
| // peer. |
| func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error { |
| e.LockUser() |
| defer e.UnlockUser() |
| return e.shutdownLocked(flags) |
| } |
| |
| func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error { |
| e.shutdownFlags |= flags |
| switch { |
| case e.EndpointState().connected(): |
| // Close for read. |
| if e.shutdownFlags&tcpip.ShutdownRead != 0 { |
| // Mark read side as closed. |
| e.rcvListMu.Lock() |
| e.rcvClosed = true |
| rcvBufUsed := e.rcvBufUsed |
| e.rcvListMu.Unlock() |
| |
| // If we're fully closed and we have unread data we need to abort |
| // the connection with a RST. |
| if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 { |
| e.resetConnectionLocked(&tcpip.ErrConnectionAborted{}) |
| // Wake up worker to terminate loop. |
| e.notifyProtocolGoroutine(notifyTickleWorker) |
| return nil |
| } |
| } |
| |
| // Close for write. |
| if e.shutdownFlags&tcpip.ShutdownWrite != 0 { |
| e.sndBufMu.Lock() |
| if e.sndClosed { |
| // Already closed. |
| e.sndBufMu.Unlock() |
| if e.EndpointState() == StateTimeWait { |
| return &tcpip.ErrNotConnected{} |
| } |
| return nil |
| } |
| |
| // Queue fin segment. |
| s := newOutgoingSegment(e.ID, nil) |
| e.sndQueue.PushBack(s) |
| e.sndBufInQueue++ |
| // Mark endpoint as closed. |
| e.sndClosed = true |
| e.sndBufMu.Unlock() |
| e.handleClose() |
| } |
| |
| return nil |
| case e.EndpointState() == StateListen: |
| if e.shutdownFlags&tcpip.ShutdownRead != 0 { |
| // Reset all connections from the accept queue and keep the |
| // worker running so that it can continue handling incoming |
| // segments by replying with RST. |
| // |
| // By not removing this endpoint from the demuxer mapping, we |
| // ensure that any other bind to the same port fails, as on Linux. |
| e.rcvListMu.Lock() |
| e.rcvClosed = true |
| e.rcvListMu.Unlock() |
| e.closePendingAcceptableConnectionsLocked() |
| // Notify waiters that the endpoint is shutdown. |
| e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr) |
| } |
| return nil |
| default: |
| return &tcpip.ErrNotConnected{} |
| } |
| } |
| |
| // Listen puts the endpoint in "listen" mode, which allows it to accept |
| // new connections. |
| func (e *endpoint) Listen(backlog int) tcpip.Error { |
| // Accept one more than the configured listen backlog to keep in parity with |
| // Linux. Ref, because of missing equality check here: |
| // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937 |
| backlog++ |
| err := e.listen(backlog) |
| if err != nil { |
| if !err.IgnoreStats() { |
| e.stack.Stats().TCP.FailedConnectionAttempts.Increment() |
| e.stats.FailedConnectionAttempts.Increment() |
| } |
| } |
| return err |
| } |
| |
| func (e *endpoint) listen(backlog int) tcpip.Error { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| if e.EndpointState() == StateListen && !e.closed { |
| e.acceptMu.Lock() |
| defer e.acceptMu.Unlock() |
| if e.accepted == (accepted{}) { |
| // listen is called after shutdown. |
| e.accepted.cap = backlog |
| e.shutdownFlags = 0 |
| e.rcvListMu.Lock() |
| e.rcvClosed = false |
| e.rcvListMu.Unlock() |
| } else { |
| // Adjust the size of the backlog iff we can fit |
| // existing pending connections into the new one. |
| if e.accepted.endpoints.Len() > backlog { |
| return &tcpip.ErrInvalidEndpointState{} |
| } |
| e.accepted.cap = backlog |
| } |
| |
| // Notify any blocked goroutines that they can attempt to |
| // deliver endpoints again. |
| e.acceptCond.Broadcast() |
| |
| return nil |
| } |
| |
| if e.EndpointState() == StateInitial { |
| // The listen is called on an unbound socket, the socket is |
| // automatically bound to a random free port with the local |
| // address set to INADDR_ANY. |
| if err := e.bindLocked(tcpip.FullAddress{}); err != nil { |
| return err |
| } |
| } |
| |
| // Endpoint must be bound before it can transition to listen mode. |
| if e.EndpointState() != StateBound { |
| e.stats.ReadErrors.InvalidEndpointState.Increment() |
| return &tcpip.ErrInvalidEndpointState{} |
| } |
| |
| // Register the endpoint. |
| if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil { |
| return err |
| } |
| |
| e.isRegistered = true |
| e.setEndpointState(StateListen) |
| |
| // The queue may be non-zero when we're restoring the endpoint, and it |
| // may be pre-populated with some previously accepted (but not Accepted) |
| // endpoints. |
| e.acceptMu.Lock() |
| if e.accepted == (accepted{}) { |
| e.accepted.cap = backlog |
| } |
| e.acceptMu.Unlock() |
| |
| e.workerRunning = true |
| go e.protocolListenLoop( // S/R-SAFE: drained on save. |
| seqnum.Size(e.receiveBufferAvailable())) |
| return nil |
| } |
| |
| // startAcceptedLoop sets up required state and starts a goroutine with the |
| // main loop for accepted connections. |
| func (e *endpoint) startAcceptedLoop() { |
| e.workerRunning = true |
| e.mu.Unlock() |
| wakerInitDone := make(chan struct{}) |
| go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save. |
| <-wakerInitDone |
| } |
| |
| // Accept returns a new endpoint if a peer has established a connection |
| // to an endpoint previously set to listen mode. |
| // |
| // addr if not-nil will contain the peer address of the returned endpoint. |
| func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| e.rcvListMu.Lock() |
| rcvClosed := e.rcvClosed |
| e.rcvListMu.Unlock() |
| // Endpoint must be in listen state before it can accept connections. |
| if rcvClosed || e.EndpointState() != StateListen { |
| return nil, nil, &tcpip.ErrInvalidEndpointState{} |
| } |
| |
| // Get the new accepted endpoint. |
| var n *endpoint |
| e.acceptMu.Lock() |
| if element := e.accepted.endpoints.Front(); element != nil { |
| n = e.accepted.endpoints.Remove(element).(*endpoint) |
| } |
| e.acceptMu.Unlock() |
| if n == nil { |
| return nil, nil, &tcpip.ErrWouldBlock{} |
| } |
| e.acceptCond.Signal() |
| if peerAddr != nil { |
| *peerAddr = n.getRemoteAddress() |
| } |
| return n, n.waiterQueue, nil |
| } |
| |
| // Bind binds the endpoint to a specific local port and optionally address. |
| func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| return e.bindLocked(addr) |
| } |
| |
| func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) { |
| // Don't allow binding once endpoint is not in the initial state |
| // anymore. This is because once the endpoint goes into a connected or |
| // listen state, it is already bound. |
| if e.EndpointState() != StateInitial { |
| return &tcpip.ErrAlreadyBound{} |
| } |
| |
| e.BindAddr = addr.Addr |
| addr, netProto, err := e.checkV4MappedLocked(addr) |
| if err != nil { |
| return err |
| } |
| |
| netProtos := []tcpip.NetworkProtocolNumber{netProto} |
| |
| // Expand netProtos to include v4 and v6 under dual-stack if the caller is |
| // binding to a wildcard (empty) address, and this is an IPv6 endpoint with |
| // v6only set to false. |
| if netProto == header.IPv6ProtocolNumber { |
| stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) |
| alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4 |
| if alsoBindToV4 { |
| netProtos = append(netProtos, header.IPv4ProtocolNumber) |
| } |
| } |
| |
| var nic tcpip.NICID |
| // If an address is specified, we must ensure that it's one of our |
| // local addresses. |
| if len(addr.Addr) != 0 { |
| nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) |
| if nic == 0 { |
| return &tcpip.ErrBadLocalAddress{} |
| } |
| e.ID.LocalAddress = addr.Addr |
| } |
| |
| bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) |
| portRes := ports.Reservation{ |
| Networks: netProtos, |
| Transport: ProtocolNumber, |
| Addr: addr.Addr, |
| Port: addr.Port, |
| Flags: e.portFlags, |
| BindToDevice: bindToDevice, |
| Dest: tcpip.FullAddress{}, |
| } |
| port, err := e.stack.ReservePort(portRes, func(p uint16) (bool, tcpip.Error) { |
| id := e.ID |
| id.LocalPort = p |
| // CheckRegisterTransportEndpoint should only return an error if there is a |
| // listening endpoint bound with the same id and portFlags and bindToDevice |
| // options. |
| // |
| // NOTE: Only listening and connected endpoint register with |
| // demuxer. Further connected endpoints always have a remote |
| // address/port. Hence this will only return an error if there is a matching |
| // listening endpoint. |
| if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil { |
| return false, nil |
| } |
| return true, nil |
| }) |
| if err != nil { |
| return err |
| } |
| |
| e.boundBindToDevice = bindToDevice |
| e.boundPortFlags = e.portFlags |
| // TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct. |
| e.boundNICID = nic |
| e.isPortReserved = true |
| e.effectiveNetProtos = netProtos |
| e.ID.LocalPort = port |
| |
| // Mark endpoint as bound. |
| e.setEndpointState(StateBound) |
| |
| return nil |
| } |
| |
| // GetLocalAddress returns the address to which the endpoint is bound. |
| func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| return tcpip.FullAddress{ |
| Addr: e.ID.LocalAddress, |
| Port: e.ID.LocalPort, |
| NIC: e.boundNICID, |
| }, nil |
| } |
| |
| // GetRemoteAddress returns the address to which the endpoint is connected. |
| func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { |
| e.LockUser() |
| defer e.UnlockUser() |
| |
| if !e.EndpointState().connected() { |
| return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} |
| } |
| |
| return e.getRemoteAddress(), nil |
| } |
| |
| func (e *endpoint) getRemoteAddress() tcpip.FullAddress { |
| return tcpip.FullAddress{ |
| Addr: e.ID.RemoteAddress, |
| Port: e.ID.RemotePort, |
| NIC: e.boundNICID, |
| } |
| } |
| |
| func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) { |
| // TCP HandlePacket is not required anymore as inbound packets first |
| // land at the Dispatcher which then can either deliver using the |
| // worker go routine or directly do the invoke the tcp processing inline |
| // based on the state of the endpoint. |
| } |
| |
| func (e *endpoint) enqueueSegment(s *segment) bool { |
| // Send packet to worker goroutine. |
| if !e.segmentQueue.enqueue(s) { |
| // The queue is full, so we drop the segment. |
| e.stack.Stats().DroppedPackets.Increment() |
| e.stats.ReceiveErrors.SegmentQueueDropped.Increment() |
| return false |
| } |
| return true |
| } |
| |
| func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) { |
| // Update last error first. |
| e.lastErrorMu.Lock() |
| e.lastError = err |
| e.lastErrorMu.Unlock() |
| |
| // Update the error queue if IP_RECVERR is enabled. |
| if e.SocketOptions().GetRecvError() { |
| e.SocketOptions().QueueErr(&tcpip.SockError{ |
| Err: err, |
| Cause: transErr, |
| // Linux passes the payload with the TCP header. We don't know if the TCP |
| // header even exists, it may not for fragmented packets. |
| Payload: pkt.Data().AsRange().ToOwnedView(), |
| Dst: tcpip.FullAddress{ |
| NIC: pkt.NICID, |
| Addr: e.ID.RemoteAddress, |
| Port: e.ID.RemotePort, |
| }, |
| Offender: tcpip.FullAddress{ |
| NIC: pkt.NICID, |
| Addr: e.ID.LocalAddress, |
| Port: e.ID.LocalPort, |
| }, |
| NetProto: pkt.NetworkProtocolNumber, |
| }) |
| } |
| |
| // Notify of the error. |
| e.notifyProtocolGoroutine(notifyError) |
| } |
| |
| // HandleError implements stack.TransportEndpoint. |
| func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { |
| handlePacketTooBig := func(mtu uint32) { |
| e.sndBufMu.Lock() |
| e.packetTooBigCount++ |
| if v := int(mtu); v < e.sndMTU { |
| e.sndMTU = v |
| } |
| e.sndBufMu.Unlock() |
| e.notifyProtocolGoroutine(notifyMTUChanged) |
| } |
| |
| // TODO(gvisor.dev/issues/5270): Handle all transport errors. |
| switch transErr.Kind() { |
| case stack.PacketTooBigTransportError: |
| handlePacketTooBig(transErr.Info()) |
| case stack.DestinationHostUnreachableTransportError: |
| e.onICMPError(&tcpip.ErrNoRoute{}, transErr, pkt) |
| case stack.DestinationNetworkUnreachableTransportError: |
| e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) |
| } |
| } |
| |
| // updateSndBufferUsage is called by the protocol goroutine when room opens up |
| // in the send buffer. The number of newly available bytes is v. |
| func (e *endpoint) updateSndBufferUsage(v int) { |
| sendBufferSize := e.getSendBufferSize() |
| e.sndBufMu.Lock() |
| notify := e.sndBufUsed >= sendBufferSize>>1 |
| e.sndBufUsed -= v |
| // We only notify when there is half the sendBufferSize available after |
| // a full buffer event occurs. This ensures that we don't wake up |
| // writers to queue just 1-2 segments and go back to sleep. |
| notify = notify && e.sndBufUsed < sendBufferSize>>1 |
| e.sndBufMu.Unlock() |
| |
| if notify { |
| e.waiterQueue.Notify(waiter.WritableEvents) |
| } |
| } |
| |
| // readyToRead is called by the protocol goroutine when a new segment is ready |
| // to be read, or when the connection is closed for receiving (in which case |
| // s will be nil). |
| func (e *endpoint) readyToRead(s *segment) { |
| e.rcvListMu.Lock() |
| if s != nil { |
| e.rcvBufUsed += s.payloadSize() |
| s.incRef() |
| e.rcvList.PushBack(s) |
| } else { |
| e.rcvClosed = true |
| } |
| e.rcvListMu.Unlock() |
| e.waiterQueue.Notify(waiter.ReadableEvents) |
| } |
| |
| // receiveBufferAvailableLocked calculates how many bytes are still available |
| // in the receive buffer. |
| // rcvListMu must be held when this function is called. |
| func (e *endpoint) receiveBufferAvailableLocked() int { |
| // We may use more bytes than the buffer size when the receive buffer |
| // shrinks. |
| memUsed := e.receiveMemUsed() |
| if memUsed >= e.rcvBufSize { |
| return 0 |
| } |
| |
| return e.rcvBufSize - memUsed |
| } |
| |
| // receiveBufferAvailable calculates how many bytes are still available in the |
| // receive buffer based on the actual memory used by all segments held in |
| // receive buffer/pending and segment queue. |
| func (e *endpoint) receiveBufferAvailable() int { |
| e.rcvListMu.Lock() |
| available := e.receiveBufferAvailableLocked() |
| e.rcvListMu.Unlock() |
| return available |
| } |
| |
| // receiveBufferUsed returns the amount of in-use receive buffer. |
| func (e *endpoint) receiveBufferUsed() int { |
| e.rcvListMu.Lock() |
| used := e.rcvBufUsed |
| e.rcvListMu.Unlock() |
| return used |
| } |
| |
| // receiveBufferSize returns the current size of the receive buffer. |
| func (e *endpoint) receiveBufferSize() int { |
| e.rcvListMu.Lock() |
| size := e.rcvBufSize |
| e.rcvListMu.Unlock() |
| return size |
| } |
| |
| // receiveMemUsed returns the total memory in use by segments held by this |
| // endpoint. |
| func (e *endpoint) receiveMemUsed() int { |
| return int(atomic.LoadInt32(&e.rcvMemUsed)) |
| } |
| |
| // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. |
| func (e *endpoint) updateReceiveMemUsed(delta int) { |
| atomic.AddInt32(&e.rcvMemUsed, int32(delta)) |
| } |
| |
| // maxReceiveBufferSize returns the stack wide maximum receive buffer size for |
| // an endpoint. |
| func (e *endpoint) maxReceiveBufferSize() int { |
| var rs tcpip.TCPReceiveBufferSizeRangeOption |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { |
| // As a fallback return the hardcoded max buffer size. |
| return MaxBufferSize |
| } |
| return rs.Max |
| } |
| |
| // rcvWndScaleForHandshake computes the receive window scale to offer to the |
| // peer when window scaling is enabled (true by default). If auto-tuning is |
| // disabled then the window scaling factor is based on the size of the |
| // receiveBuffer otherwise we use the max permissible receive buffer size to |
| // compute the scale. |
| func (e *endpoint) rcvWndScaleForHandshake() int { |
| bufSizeForScale := e.receiveBufferSize() |
| |
| e.rcvListMu.Lock() |
| autoTuningDisabled := e.rcvAutoParams.disabled |
| e.rcvListMu.Unlock() |
| if autoTuningDisabled { |
| return FindWndScale(seqnum.Size(bufSizeForScale)) |
| } |
| |
| return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) |
| } |
| |
| // updateRecentTimestamp updates the recent timestamp using the algorithm |
| // described in https://tools.ietf.org/html/rfc7323#section-4.3 |
| func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { |
| if e.sendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { |
| e.setRecentTimestamp(tsVal) |
| } |
| } |
| |
| // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if |
| // the SYN options indicate that timestamp option was negotiated. It also |
| // initializes the recentTS with the value provided in synOpts.TSval. |
| func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) { |
| if synOpts.TS { |
| e.sendTSOk = true |
| e.setRecentTimestamp(synOpts.TSVal) |
| } |
| } |
| |
| // timestamp returns the timestamp value to be used in the TSVal field of the |
| // timestamp option for outgoing TCP segments for a given endpoint. |
| func (e *endpoint) timestamp() uint32 { |
| return tcpTimeStamp(time.Now(), e.tsOffset) |
| } |
| |
| // tcpTimeStamp returns a timestamp offset by the provided offset. This is |
| // not inlined above as it's used when SYN cookies are in use and endpoint |
| // is not created at the time when the SYN cookie is sent. |
| func tcpTimeStamp(curTime time.Time, offset uint32) uint32 { |
| return uint32(curTime.Unix()*1000+int64(curTime.Nanosecond()/1e6)) + offset |
| } |
| |
| // timeStampOffset returns a randomized timestamp offset to be used when sending |
| // timestamp values in a timestamp option for a TCP segment. |
| func timeStampOffset() uint32 { |
| b := make([]byte, 4) |
| if _, err := rand.Read(b); err != nil { |
| panic(err) |
| } |
| // Initialize a random tsOffset that will be added to the recentTS |
| // everytime the timestamp is sent when the Timestamp option is enabled. |
| // |
| // See https://tools.ietf.org/html/rfc7323#section-5.4 for details on |
| // why this is required. |
| // |
| // NOTE: This is not completely to spec as normally this should be |
| // initialized in a manner analogous to how sequence numbers are |
| // randomized per connection basis. But for now this is sufficient. |
| return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 |
| } |
| |
| // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint |
| // if the SYN options indicate that the SACK option was negotiated and the TCP |
| // stack is configured to enable TCP SACK option. |
| func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { |
| var v tcpip.TCPSACKEnabled |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { |
| // Stack doesn't support SACK. So just return. |
| return |
| } |
| if bool(v) && synOpts.SACKPermitted { |
| e.sackPermitted = true |
| } |
| } |
| |
| // maxOptionSize return the maximum size of TCP options. |
| func (e *endpoint) maxOptionSize() (size int) { |
| var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock |
| options := e.makeOptions(maxSackBlocks[:]) |
| size = len(options) |
| putOptions(options) |
| |
| return size |
| } |
| |
| // completeState makes a full copy of the endpoint and returns it. This is used |
| // before invoking the probe. The state returned may not be fully consistent if |
| // there are intervening syscalls when the state is being copied. |
| func (e *endpoint) completeState() stack.TCPEndpointState { |
| var s stack.TCPEndpointState |
| s.SegTime = time.Now() |
| |
| // Copy EndpointID. |
| s.ID = stack.TCPEndpointID(e.ID) |
| |
| // Copy endpoint rcv state. |
| e.rcvListMu.Lock() |
| s.RcvBufSize = e.rcvBufSize |
| s.RcvBufUsed = e.rcvBufUsed |
| s.RcvClosed = e.rcvClosed |
| s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime |
| s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied |
| s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied |
| s.RcvAutoParams.RTT = e.rcvAutoParams.rtt |
| s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber |
| s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime |
| s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled |
| e.rcvListMu.Unlock() |
| |
| // Endpoint TCP Option state. |
| s.SendTSOk = e.sendTSOk |
| s.RecentTS = e.recentTimestamp() |
| s.TSOffset = e.tsOffset |
| s.SACKPermitted = e.sackPermitted |
| s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) |
| copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) |
| s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() |
| |
| // Copy endpoint send state. |
| sndBufSize := e.getSendBufferSize() |
| e.sndBufMu.Lock() |
| s.SndBufSize = sndBufSize |
| s.SndBufUsed = e.sndBufUsed |
| s.SndClosed = e.sndClosed |
| s.SndBufInQueue = e.sndBufInQueue |
| s.PacketTooBigCount = e.packetTooBigCount |
| s.SndMTU = e.sndMTU |
| e.sndBufMu.Unlock() |
| |
| // Copy receiver state. |
| s.Receiver = stack.TCPReceiverState{ |
| RcvNxt: e.rcv.rcvNxt, |
| RcvAcc: e.rcv.rcvAcc, |
| RcvWndScale: e.rcv.rcvWndScale, |
| PendingBufUsed: e.rcv.pendingBufUsed, |
| } |
| |
| // Copy sender state. |
| s.Sender = stack.TCPSenderState{ |
| LastSendTime: e.snd.lastSendTime, |
| DupAckCount: e.snd.dupAckCount, |
| FastRecovery: stack.TCPFastRecoveryState{ |
| Active: e.snd.fr.active, |
| First: e.snd.fr.first, |
| Last: e.snd.fr.last, |
| MaxCwnd: e.snd.fr.maxCwnd, |
| HighRxt: e.snd.fr.highRxt, |
| RescueRxt: e.snd.fr.rescueRxt, |
| }, |
| SndCwnd: e.snd.sndCwnd, |
| Ssthresh: e.snd.sndSsthresh, |
| SndCAAckCount: e.snd.sndCAAckCount, |
| Outstanding: e.snd.outstanding, |
| SackedOut: e.snd.sackedOut, |
| SndWnd: e.snd.sndWnd, |
| SndUna: e.snd.sndUna, |
| SndNxt: e.snd.sndNxt, |
| RTTMeasureSeqNum: e.snd.rttMeasureSeqNum, |
| RTTMeasureTime: e.snd.rttMeasureTime, |
| Closed: e.snd.closed, |
| RTO: e.snd.rto, |
| MaxPayloadSize: e.snd.maxPayloadSize, |
| SndWndScale: e.snd.sndWndScale, |
| MaxSentAck: e.snd.maxSentAck, |
| } |
| e.snd.rtt.Lock() |
| s.Sender.SRTT = e.snd.rtt.srtt |
| s.Sender.SRTTInited = e.snd.rtt.srttInited |
| e.snd.rtt.Unlock() |
| |
| if cubic, ok := e.snd.cc.(*cubicState); ok { |
| s.Sender.Cubic = stack.TCPCubicState{ |
| WMax: cubic.wMax, |
| WLastMax: cubic.wLastMax, |
| T: cubic.t, |
| TimeSinceLastCongestion: time.Since(cubic.t), |
| C: cubic.c, |
| K: cubic.k, |
| Beta: cubic.beta, |
| WC: cubic.wC, |
| WEst: cubic.wEst, |
| } |
| } |
| |
| rc := &e.snd.rc |
| s.Sender.RACKState = stack.TCPRACKState{ |
| XmitTime: rc.xmitTime, |
| EndSequence: rc.endSequence, |
| FACK: rc.fack, |
| RTT: rc.rtt, |
| Reord: rc.reorderSeen, |
| DSACKSeen: rc.dsackSeen, |
| ReoWnd: rc.reoWnd, |
| ReoWndIncr: rc.reoWndIncr, |
| ReoWndPersist: rc.reoWndPersist, |
| RTTSeq: rc.rttSeq, |
| } |
| return s |
| } |
| |
| func (e *endpoint) initHardwareGSO() { |
| gso := &stack.GSO{} |
| switch e.route.NetProto() { |
| case header.IPv4ProtocolNumber: |
| gso.Type = stack.GSOTCPv4 |
| gso.L3HdrLen = header.IPv4MinimumSize |
| case header.IPv6ProtocolNumber: |
| gso.Type = stack.GSOTCPv6 |
| gso.L3HdrLen = header.IPv6MinimumSize |
| default: |
| panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto)) |
| } |
| gso.NeedsCsum = true |
| gso.CsumOffset = header.TCPChecksumOffset |
| gso.MaxSize = e.route.GSOMaxSize() |
| e.gso = gso |
| } |
| |
| func (e *endpoint) initGSO() { |
| if e.route.HasHardwareGSOCapability() { |
| e.initHardwareGSO() |
| } else if e.route.HasSoftwareGSOCapability() { |
| e.gso = &stack.GSO{ |
| MaxSize: e.route.GSOMaxSize(), |
| Type: stack.GSOSW, |
| NeedsCsum: false, |
| } |
| } |
| } |
| |
| // State implements tcpip.Endpoint.State. It exports the endpoint's protocol |
| // state for diagnostics. |
| func (e *endpoint) State() uint32 { |
| return uint32(e.EndpointState()) |
| } |
| |
| // Info returns a copy of the endpoint info. |
| func (e *endpoint) Info() tcpip.EndpointInfo { |
| e.LockUser() |
| // Make a copy of the endpoint info. |
| ret := e.TransportEndpointInfo |
| e.UnlockUser() |
| return &ret |
| } |
| |
| // Stats returns a pointer to the endpoint stats. |
| func (e *endpoint) Stats() tcpip.EndpointStats { |
| return &e.stats |
| } |
| |
| // Wait implements stack.TransportEndpoint.Wait. |
| func (e *endpoint) Wait() { |
| waitEntry, notifyCh := waiter.NewChannelEntry(nil) |
| e.waiterQueue.EventRegister(&waitEntry, waiter.EventHUp) |
| defer e.waiterQueue.EventUnregister(&waitEntry) |
| for { |
| e.LockUser() |
| running := e.workerRunning |
| e.UnlockUser() |
| if !running { |
| break |
| } |
| <-notifyCh |
| } |
| } |
| |
| // SocketOptions implements tcpip.Endpoint.SocketOptions. |
| func (e *endpoint) SocketOptions() *tcpip.SocketOptions { |
| return &e.ops |
| } |
| |
| // GetTCPSendBufferLimits is used to get send buffer size limits for TCP. |
| func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption { |
| var ss tcpip.TCPSendBufferSizeRangeOption |
| if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { |
| panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) |
| } |
| |
| return tcpip.SendBufferSizeOption{ |
| Min: ss.Min, |
| Default: ss.Default, |
| Max: ss.Max, |
| } |
| } |
| |
| // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. |
| func (e *endpoint) allowOutOfWindowAck() bool { |
| var limit stack.TCPInvalidRateLimitOption |
| if err := e.stack.Option(&limit); err != nil { |
| panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) |
| } |
| |
| now := time.Now() |
| if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { |
| return false |
| } |
| |
| e.lastOutOfWindowAckTime = now |
| return true |
| } |