| // Copyright 2018 The gVisor Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package tcp |
| |
| import ( |
| "encoding/binary" |
| "fmt" |
| "math" |
| "strings" |
| "sync" |
| "sync/atomic" |
| "time" |
| |
| "github.com/google/netstack/rand" |
| "github.com/google/netstack/sleep" |
| "github.com/google/netstack/tcpip" |
| "github.com/google/netstack/tcpip/buffer" |
| "github.com/google/netstack/tcpip/hash/jenkins" |
| "github.com/google/netstack/tcpip/header" |
| "github.com/google/netstack/tcpip/iptables" |
| "github.com/google/netstack/tcpip/seqnum" |
| "github.com/google/netstack/tcpip/stack" |
| "github.com/google/netstack/tmutex" |
| "github.com/google/netstack/waiter" |
| ) |
| |
// EndpointState represents the state of a TCP endpoint.
type EndpointState uint32

// Endpoint states. Note that these are represented in a netstack-specific
// manner and may not be meaningful externally. Specifically, they need to be
// translated to Linux's representation for these states if presented to
// userspace.
//
// NOTE(review): endpoint (which is +stateify savable) stores one of these
// values, so the numeric encoding produced by iota below is presumably part
// of the saved state — do not reorder without confirming save/restore
// compatibility.
const (
	// Endpoint states internal to netstack. These map to the TCP state CLOSED.
	StateInitial EndpointState = iota
	StateBound
	StateConnecting // Connect() called, but the initial SYN hasn't been sent.
	StateError

	// TCP protocol states.
	StateEstablished
	StateSynSent
	StateSynRecv
	StateFinWait1
	StateFinWait2
	StateTimeWait
	StateClose
	StateCloseWait
	StateLastAck
	StateListen
	StateClosing
)
| |
| // connected is the set of states where an endpoint is connected to a peer. |
| func (s EndpointState) connected() bool { |
| switch s { |
| case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: |
| return true |
| default: |
| return false |
| } |
| } |
| |
| // String implements fmt.Stringer.String. |
| func (s EndpointState) String() string { |
| switch s { |
| case StateInitial: |
| return "INITIAL" |
| case StateBound: |
| return "BOUND" |
| case StateConnecting: |
| return "CONNECTING" |
| case StateError: |
| return "ERROR" |
| case StateEstablished: |
| return "ESTABLISHED" |
| case StateSynSent: |
| return "SYN-SENT" |
| case StateSynRecv: |
| return "SYN-RCVD" |
| case StateFinWait1: |
| return "FIN-WAIT1" |
| case StateFinWait2: |
| return "FIN-WAIT2" |
| case StateTimeWait: |
| return "TIME-WAIT" |
| case StateClose: |
| return "CLOSED" |
| case StateCloseWait: |
| return "CLOSE-WAIT" |
| case StateLastAck: |
| return "LAST-ACK" |
| case StateListen: |
| return "LISTEN" |
| case StateClosing: |
| return "CLOSING" |
| default: |
| panic("unreachable") |
| } |
| } |
| |
// Reasons for notifying the protocol goroutine. These are bit flags that are
// ORed into endpoint.notifyFlags by notifyProtocolGoroutine and drained
// atomically by fetchNotifications.
const (
	notifyNonZeroReceiveWindow = 1 << iota
	notifyReceiveWindowChanged
	notifyClose
	notifyMTUChanged
	notifyDrain
	notifyReset
	notifyKeepaliveChanged
	notifyMSSChanged
)
| |
// SACKInfo holds TCP SACK related information for a given endpoint.
//
// +stateify savable
type SACKInfo struct {
	// Blocks is the maximum number of SACK blocks we track
	// per endpoint.
	Blocks [MaxSACKBlocks]header.SACKBlock

	// NumBlocks is the number of valid SACK blocks stored in the
	// Blocks array above.
	NumBlocks int
}
| |
// rcvBufAutoTuneParams are used to hold state variables to compute
// the auto tuned recv buffer size.
//
// +stateify savable
type rcvBufAutoTuneParams struct {
	// measureTime is the time at which the current measurement
	// was started.
	measureTime time.Time

	// copied is the number of bytes copied out of the receive
	// buffers since this measure began.
	copied int

	// prevCopied is the number of bytes copied out of the receive
	// buffers in the previous RTT period.
	prevCopied int

	// rtt is the non-smoothed minimum RTT as measured by observing the time
	// between when a byte is first acknowledged and the receipt of data
	// that is at least one window beyond the sequence number that was
	// acknowledged.
	rtt time.Duration

	// rttMeasureSeqNumber is the highest acceptable sequence number at the
	// time this RTT measurement period began.
	rttMeasureSeqNumber seqnum.Value

	// rttMeasureTime is the absolute time at which the current rtt
	// measurement period began.
	rttMeasureTime time.Time

	// disabled is true if an explicit receive buffer is set for the
	// endpoint, or if receive buffer moderation is turned off stack-wide
	// (see newEndpoint). When true, ModerateRecvBuf is a no-op.
	disabled bool
}
| |
// ReceiveErrors collect segment receive errors within transport layer.
// It embeds and extends the generic tcpip.ReceiveErrors counters with
// TCP-specific ones.
type ReceiveErrors struct {
	tcpip.ReceiveErrors

	// SegmentQueueDropped is the number of segments dropped due to
	// a full segment queue.
	SegmentQueueDropped tcpip.StatCounter

	// ChecksumErrors is the number of segments dropped due to bad checksums.
	ChecksumErrors tcpip.StatCounter

	// ListenOverflowSynDrop is the number of times the listen queue overflowed
	// and a SYN was dropped.
	ListenOverflowSynDrop tcpip.StatCounter

	// ListenOverflowAckDrop is the number of times the final ACK
	// in the handshake was dropped due to overflow.
	ListenOverflowAckDrop tcpip.StatCounter

	// ZeroRcvWindowState is the number of times we advertised
	// a zero receive window when rcvList is full.
	ZeroRcvWindowState tcpip.StatCounter
}
| |
// SendErrors collect segment send errors within the transport layer.
// It embeds and extends the generic tcpip.SendErrors counters with
// TCP-specific ones.
type SendErrors struct {
	tcpip.SendErrors

	// SegmentSendToNetworkFailed is the number of TCP segments failed to be sent
	// to the network endpoint.
	SegmentSendToNetworkFailed tcpip.StatCounter

	// SynSendToNetworkFailed is the number of TCP SYNs failed to be sent
	// to the network endpoint.
	SynSendToNetworkFailed tcpip.StatCounter

	// Retransmits is the number of TCP segments retransmitted.
	Retransmits tcpip.StatCounter

	// FastRetransmit is the number of segments retransmitted in fast
	// recovery.
	FastRetransmit tcpip.StatCounter

	// Timeouts is the number of times the RTO expired.
	Timeouts tcpip.StatCounter
}
| |
// Stats holds statistics about the endpoint.
type Stats struct {
	// SegmentsReceived is the number of TCP segments received that
	// the transport layer successfully parsed.
	SegmentsReceived tcpip.StatCounter

	// SegmentsSent is the number of TCP segments sent.
	SegmentsSent tcpip.StatCounter

	// FailedConnectionAttempts is the number of times we saw Connect and
	// Accept errors.
	FailedConnectionAttempts tcpip.StatCounter

	// ReceiveErrors collects segment receive errors within the
	// transport layer.
	ReceiveErrors ReceiveErrors

	// ReadErrors collects segment read errors from an endpoint read call.
	ReadErrors tcpip.ReadErrors

	// SendErrors collects segment send errors within the transport layer.
	SendErrors SendErrors

	// WriteErrors collects segment write errors from an endpoint write call.
	WriteErrors tcpip.WriteErrors
}
| |
// IsEndpointStats is an empty method to implement the tcpip.EndpointStats
// marker interface.
func (*Stats) IsEndpointStats() {}
| |
// EndpointInfo holds useful information about a transport endpoint which
// can be queried by monitoring tools.
//
// +stateify savable
type EndpointInfo struct {
	stack.TransportEndpointInfo

	// HardError is meaningful only when state is StateError. It stores the
	// error to be returned when read/write syscalls are called and the
	// endpoint is in this state. HardError is protected by endpoint mu.
	HardError *tcpip.Error
}
| |
// IsEndpointInfo is an empty method to implement the tcpip.EndpointInfo
// marker interface.
func (*EndpointInfo) IsEndpointInfo() {}
| |
// endpoint represents a TCP endpoint. This struct serves as the interface
// between users of the endpoint and the protocol implementation; it is legal to
// have concurrent goroutines make calls into the endpoint, they are properly
// synchronized. The protocol implementation, however, runs in a single
// goroutine.
//
// +stateify savable
type endpoint struct {
	EndpointInfo

	// workMu is used to arbitrate which goroutine may perform protocol
	// work. Only the main protocol goroutine is expected to call Lock() on
	// it, but other goroutines (e.g., send) may call TryLock() to eagerly
	// perform work without having to wait for the main one to wake up.
	workMu tmutex.Mutex

	// The following fields are initialized at creation time and do not
	// change throughout the lifetime of the endpoint.
	stack       *stack.Stack
	waiterQueue *waiter.Queue

	// lastError represents the last error that the endpoint reported;
	// access to it is protected by the following mutex.
	lastErrorMu sync.Mutex
	lastError   *tcpip.Error

	// The following fields are used to manage the receive queue. The
	// protocol goroutine adds ready-for-delivery segments to rcvList,
	// which are returned by Read() calls to users.
	//
	// Once the peer has closed its send side, rcvClosed is set to true
	// to indicate to users that no more data is coming.
	//
	// rcvListMu can be taken after the endpoint mu below.
	rcvListMu     sync.Mutex
	rcvList       segmentList
	rcvClosed     bool
	rcvBufSize    int
	rcvBufUsed    int
	rcvAutoParams rcvBufAutoTuneParams
	// zeroWindow indicates that the window was closed due to receive buffer
	// space being filled up. This is set by the worker goroutine before
	// moving a segment to the rcvList. This setting is cleared by the
	// endpoint when a Read() call reads enough data for the new window to
	// be non-zero.
	zeroWindow bool

	// The following fields are protected by the mutex.
	mu sync.RWMutex

	// state is the current EndpointState; use the EndpointState helpers
	// (connected, String) to interpret it.
	state EndpointState

	isPortReserved    bool
	isRegistered      bool
	boundNICID        tcpip.NICID
	route             stack.Route
	ttl               uint8
	v6only            bool
	isConnectNotified bool
	// TCP should never broadcast but Linux nevertheless supports enabling/
	// disabling SO_BROADCAST, albeit as a NOOP.
	broadcast bool

	// effectiveNetProtos contains the network protocols actually in use. In
	// most cases it will only contain "netProto", but in cases like IPv6
	// endpoints with v6only set to false, this could include multiple
	// protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g.,
	// IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped
	// address).
	effectiveNetProtos []tcpip.NetworkProtocolNumber

	// workerRunning specifies if a worker goroutine is running.
	workerRunning bool

	// workerCleanup specifies if the worker goroutine must perform cleanup
	// before exiting. This can only be set to true when workerRunning is
	// also true, and they're both protected by the mutex.
	workerCleanup bool

	// sendTSOk is used to indicate when the TS Option has been negotiated.
	// When sendTSOk is true every non-RST segment should carry a TS as per
	// RFC7323#section-1.1
	sendTSOk bool

	// recentTS is the timestamp that should be sent in the TSEcr field of
	// the timestamp for future segments sent by the endpoint. This field is
	// updated if required when a new segment is received by this endpoint.
	recentTS uint32

	// tsOffset is a randomized offset added to the value of the
	// TSVal field in the timestamp option.
	tsOffset uint32

	// shutdownFlags represent the current shutdown state of the endpoint.
	shutdownFlags tcpip.ShutdownFlags

	// sackPermitted is set to true if the peer sends the TCPSACKPermitted
	// option in the SYN/SYN-ACK.
	sackPermitted bool

	// sack holds TCP SACK related information for this endpoint.
	sack SACKInfo

	// reusePort is set to true if SO_REUSEPORT is enabled.
	reusePort bool

	// bindToDevice is set to the NIC on which to bind or disabled if 0.
	bindToDevice tcpip.NICID

	// delay enables Nagle's algorithm.
	//
	// delay is a boolean (0 is false) and must be accessed atomically.
	delay uint32

	// cork holds back segments until full.
	//
	// cork is a boolean (0 is false) and must be accessed atomically.
	cork uint32

	// scoreboard holds TCP SACK Scoreboard information for this endpoint.
	scoreboard *SACKScoreboard

	// The options below aren't implemented, but we remember the user
	// settings because applications expect to be able to set/query these
	// options.
	reuseAddr bool

	// slowAck holds the negated state of quick ack. It is stubbed out and
	// does nothing.
	//
	// slowAck is a boolean (0 is false) and must be accessed atomically.
	slowAck uint32

	// segmentQueue is used to hand received segments to the protocol
	// goroutine. Segments are queued as long as the queue is not full,
	// and dropped when it is.
	segmentQueue segmentQueue

	// synRcvdCount is the number of connections for this endpoint that are
	// in SYN-RCVD state.
	synRcvdCount int

	// userMSS if non-zero is the MSS value explicitly set by the user
	// for this endpoint using the TCP_MAXSEG setsockopt.
	userMSS int

	// The following fields are used to manage the send buffer. When
	// segments are ready to be sent, they are added to sndQueue and the
	// protocol goroutine is signaled via sndWaker.
	//
	// When the send side is closed, the protocol goroutine is notified via
	// sndCloseWaker, and sndClosed is set to true.
	sndBufMu      sync.Mutex
	sndBufSize    int
	sndBufUsed    int
	sndClosed     bool
	sndBufInQueue seqnum.Size
	sndQueue      segmentList
	sndWaker      sleep.Waker
	sndCloseWaker sleep.Waker

	// cc stores the name of the Congestion Control algorithm to use for
	// this endpoint.
	cc tcpip.CongestionControlOption

	// The following are used when a "packet too big" control packet is
	// received. They are protected by sndBufMu. They are used to
	// communicate to the main protocol goroutine how many such control
	// messages have been received since the last notification was processed
	// and what was the smallest MTU seen.
	packetTooBigCount int
	sndMTU            int

	// newSegmentWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and handle new segments queued to it.
	newSegmentWaker sleep.Waker

	// notificationWaker is used to indicate to the protocol goroutine that
	// it needs to wake up and check for notifications.
	notificationWaker sleep.Waker

	// notifyFlags is a bitmask of flags used to indicate to the protocol
	// goroutine of what it was notified; this is only accessed atomically.
	notifyFlags uint32

	// keepalive manages TCP keepalive state. When the connection is idle
	// (no data sent or received) for keepaliveIdle, we start sending
	// keepalives every keepalive.interval. If we send keepalive.count
	// without hearing a response, the connection is closed.
	keepalive keepalive

	// pendingAccepted is a synchronization primitive used to track number
	// of connections that are queued up to be delivered to the accepted
	// channel. We use this to ensure that all goroutines blocked on writing
	// to the acceptedChan below terminate before we close acceptedChan.
	pendingAccepted sync.WaitGroup

	// acceptedChan is used by a listening endpoint protocol goroutine to
	// send newly accepted connections to the endpoint so that they can be
	// read by Accept() calls.
	acceptedChan chan *endpoint

	// The following are only used from the protocol goroutine, and
	// therefore don't need locks to protect them.
	rcv *receiver
	snd *sender

	// The goroutine drain completion notification channel.
	drainDone chan struct{}

	// The goroutine undrain notification channel. This is currently used as
	// a way to block the worker goroutines. Today nothing closes/writes
	// this channel and this causes any goroutines waiting on this to just
	// block. This is used during save/restore to prevent worker goroutines
	// from mutating state as it's being saved.
	undrain chan struct{}

	// probe if not nil is invoked on every received segment. It is passed
	// a copy of the current state of the endpoint.
	probe stack.TCPProbeFunc

	// The following are only used to assist the restore run to re-connect.
	connectingAddress tcpip.Address

	// amss is the advertised MSS to the peer by this endpoint.
	amss uint16

	// sendTOS represents IPv4 TOS or IPv6 TrafficClass,
	// applied while sending packets. Defaults to 0 as on Linux.
	sendTOS uint8

	gso *stack.GSO

	// TODO(b/142022063): Add ability to save and restore per endpoint stats.
	stats Stats
}
| |
// StopWork halts packet processing. Only to be used in tests.
//
// It blocks until the work mutex can be acquired, i.e. until any in-flight
// protocol work completes; ResumeWork releases it again.
func (e *endpoint) StopWork() {
	e.workMu.Lock()
}
| |
// ResumeWork resumes packet processing. Only to be used in tests.
//
// Must be paired with a prior StopWork call.
func (e *endpoint) ResumeWork() {
	e.workMu.Unlock()
}
| |
// keepalive is a synchronization wrapper used to appease stateify. See the
// comment in endpoint, where it is used.
//
// +stateify savable
type keepalive struct {
	sync.Mutex
	// enabled is true when keepalive probing is turned on for the endpoint.
	enabled bool
	// idle is how long the connection must be idle before the first
	// keepalive is sent (see the keepalive field comment in endpoint).
	idle time.Duration
	// interval is the time between successive keepalives.
	interval time.Duration
	// count is the number of keepalives sent without hearing a response
	// after which the connection is closed.
	count int
	// unacked is the number of keepalives sent so far without a response —
	// presumably reset on activity; confirm in the connection worker code.
	unacked int
	// timer drives keepalive expiration; waker is asserted when it fires.
	timer timer
	waker sleep.Waker
}
| |
| func newEndpoint(s *stack.Stack, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint { |
| e := &endpoint{ |
| stack: s, |
| EndpointInfo: EndpointInfo{ |
| TransportEndpointInfo: stack.TransportEndpointInfo{ |
| NetProto: netProto, |
| TransProto: header.TCPProtocolNumber, |
| }, |
| }, |
| waiterQueue: waiterQueue, |
| state: StateInitial, |
| rcvBufSize: DefaultReceiveBufferSize, |
| sndBufSize: DefaultSendBufferSize, |
| sndMTU: int(math.MaxInt32), |
| reuseAddr: true, |
| keepalive: keepalive{ |
| // Linux defaults. |
| idle: 2 * time.Hour, |
| interval: 75 * time.Second, |
| count: 9, |
| }, |
| } |
| |
| var ss SendBufferSizeOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil { |
| e.sndBufSize = ss.Default |
| } |
| |
| var rs ReceiveBufferSizeOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil { |
| e.rcvBufSize = rs.Default |
| } |
| |
| var cs tcpip.CongestionControlOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil { |
| e.cc = cs |
| } |
| |
| var mrb tcpip.ModerateReceiveBufferOption |
| if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil { |
| e.rcvAutoParams.disabled = !bool(mrb) |
| } |
| |
| if p := s.GetTCPProbe(); p != nil { |
| e.probe = p |
| } |
| |
| e.segmentQueue.setLimit(MaxUnprocessedSegments) |
| e.workMu.Init() |
| e.workMu.Lock() |
| e.tsOffset = timeStampOffset() |
| |
| return e |
| } |
| |
| // Readiness returns the current readiness of the endpoint. For example, if |
| // waiter.EventIn is set, the endpoint is immediately readable. |
| func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask { |
| result := waiter.EventMask(0) |
| |
| e.mu.RLock() |
| defer e.mu.RUnlock() |
| |
| switch e.state { |
| case StateInitial, StateBound, StateConnecting, StateSynSent, StateSynRecv: |
| // Ready for nothing. |
| |
| case StateClose, StateError: |
| // Ready for anything. |
| result = mask |
| |
| case StateListen: |
| // Check if there's anything in the accepted channel. |
| if (mask & waiter.EventIn) != 0 { |
| if len(e.acceptedChan) > 0 { |
| result |= waiter.EventIn |
| } |
| } |
| } |
| if e.state.connected() { |
| // Determine if the endpoint is writable if requested. |
| if (mask & waiter.EventOut) != 0 { |
| e.sndBufMu.Lock() |
| if e.sndClosed || e.sndBufUsed < e.sndBufSize { |
| result |= waiter.EventOut |
| } |
| e.sndBufMu.Unlock() |
| } |
| |
| // Determine if the endpoint is readable if requested. |
| if (mask & waiter.EventIn) != 0 { |
| e.rcvListMu.Lock() |
| if e.rcvBufUsed > 0 || e.rcvClosed { |
| result |= waiter.EventIn |
| } |
| e.rcvListMu.Unlock() |
| } |
| } |
| |
| return result |
| } |
| |
// fetchNotifications atomically reads and clears the pending notification
// flags, returning the bitmask that was set.
func (e *endpoint) fetchNotifications() uint32 {
	return atomic.SwapUint32(&e.notifyFlags, 0)
}
| |
| func (e *endpoint) notifyProtocolGoroutine(n uint32) { |
| for { |
| v := atomic.LoadUint32(&e.notifyFlags) |
| if v&n == n { |
| // The flags are already set. |
| return |
| } |
| |
| if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) { |
| if v == 0 { |
| // We are causing a transition from no flags to |
| // at least one flag set, so we must cause the |
| // protocol goroutine to wake up. |
| e.notificationWaker.Assert() |
| } |
| return |
| } |
| } |
| } |
| |
// Close puts the endpoint in a closed state and frees all resources associated
// with it. It must be called only once and with no other concurrent calls to
// the endpoint.
func (e *endpoint) Close() {
	// Issue a shutdown so that the peer knows we won't send any more data
	// if we're connected, or stop accepting if we're listening.
	e.Shutdown(tcpip.ShutdownWrite | tcpip.ShutdownRead)

	e.mu.Lock()

	// For listening sockets, we always release ports inline so that they
	// are immediately available for reuse after Close() is called. If also
	// registered, we unregister as well otherwise the next user would fail
	// in Listen() when trying to register.
	if e.state == StateListen && e.isPortReserved {
		if e.isRegistered {
			// Unregister before releasing the port so no new segments can
			// be demultiplexed to this endpoint.
			e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
			e.isRegistered = false
		}

		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
		e.isPortReserved = false
	}

	// Either perform the local cleanup or kick the worker to make sure it
	// knows it needs to cleanup.
	tcpip.AddDanglingEndpoint(e)
	if !e.workerRunning {
		e.cleanupLocked()
	} else {
		// The worker observes workerCleanup and performs cleanupLocked
		// itself before exiting.
		e.workerCleanup = true
		e.notifyProtocolGoroutine(notifyClose)
	}

	e.mu.Unlock()
}
| |
// closePendingAcceptableConnections closes all connections that have completed
// handshake but not yet been delivered to the application.
//
// Caller must hold e.mu (the "Locked" suffix).
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	done := make(chan struct{})
	// Spin a goroutine up as ranging on e.acceptedChan will just block when
	// there are no more connections in the channel. Using a non-blocking
	// select does not work as it can potentially select the default case
	// even when there are pending writes but that are not yet written to
	// the channel.
	go func() {
		defer close(done)
		for n := range e.acceptedChan {
			// Abort each undelivered connection before closing it so
			// the peer sees a reset rather than a graceful close.
			n.mu.Lock()
			n.resetConnectionLocked(tcpip.ErrConnectionAborted)
			n.mu.Unlock()
			n.Close()
		}
	}()
	// pendingAccepted(see endpoint.deliverAccepted) tracks the number of
	// endpoints which have completed handshake but are not yet written to
	// the e.acceptedChan. We wait here till the goroutine above can drain
	// all such connections from e.acceptedChan.
	e.pendingAccepted.Wait()
	// Closing the channel terminates the range loop in the drain goroutine;
	// done is closed once it has finished.
	close(e.acceptedChan)
	<-done
	e.acceptedChan = nil
}
| |
// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
//
// Caller must hold e.mu.
func (e *endpoint) cleanupLocked() {
	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	if e.acceptedChan != nil {
		e.closePendingAcceptableConnectionsLocked()
	}
	e.workerCleanup = false

	// Unregister before releasing the port so no new segments can be
	// demultiplexed to this endpoint in between.
	if e.isRegistered {
		e.stack.UnregisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.bindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, e.ID.LocalAddress, e.ID.LocalPort, e.bindToDevice)
		e.isPortReserved = false
	}

	e.route.Release()
	tcpip.DeleteDanglingEndpoint(e)
}
| |
| // initialReceiveWindow returns the initial receive window to advertise in the |
| // SYN/SYN-ACK. |
| func (e *endpoint) initialReceiveWindow() int { |
| rcvWnd := e.receiveBufferAvailable() |
| if rcvWnd > math.MaxUint16 { |
| rcvWnd = math.MaxUint16 |
| } |
| routeWnd := InitialCwnd * int(mssForRoute(&e.route)) * 2 |
| if rcvWnd > routeWnd { |
| rcvWnd = routeWnd |
| } |
| return rcvWnd |
| } |
| |
// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to user space.
//
// copied is the number of bytes just copied out by the caller. Measurement
// periods are one (non-smoothed minimum) RTT long; at the end of each period
// the buffer may be grown, never shrunk.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.rcvListMu.Lock()
	// Auto tuning is disabled when an explicit receive buffer was set or
	// stack-wide moderation is off (see rcvBufAutoTuneParams.disabled).
	if e.rcvAutoParams.disabled {
		e.rcvListMu.Unlock()
		return
	}
	now := time.Now()
	// Until a full RTT has elapsed (or while no RTT estimate exists yet),
	// just accumulate the copied byte count for this measurement period.
	if rtt := e.rcvAutoParams.rtt; rtt == 0 || now.Sub(e.rcvAutoParams.measureTime) < rtt {
		e.rcvAutoParams.copied += copied
		e.rcvListMu.Unlock()
		return
	}
	prevRTTCopied := e.rcvAutoParams.copied + copied
	prevCopied := e.rcvAutoParams.prevCopied
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		if rcvWnd > e.rcvBufSize {
			e.rcvBufSize = rcvWnd
			e.notifyProtocolGoroutine(notifyReceiveWindowChanged)
		}

		// We only update prevCopied when we grow the buffer because in cases
		// where prevCopied > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.rcvAutoParams.prevCopied = prevRTTCopied
	}
	// Start a fresh measurement period regardless of whether we grew.
	e.rcvAutoParams.measureTime = now
	e.rcvAutoParams.copied = 0
	e.rcvListMu.Unlock()
}
| |
// IPTables implements tcpip.Endpoint.IPTables.
//
// It simply returns the stack-wide table; the error is always nil.
func (e *endpoint) IPTables() (iptables.IPTables, error) {
	return e.stack.IPTables(), nil
}
| |
// Read reads data from the endpoint.
//
// Returns at most one buffered view per call; the address argument is
// ignored for TCP. Lock order: e.mu is taken before e.rcvListMu (see the
// rcvListMu field comment).
func (e *endpoint) Read(*tcpip.FullAddress) (buffer.View, tcpip.ControlMessages, *tcpip.Error) {
	e.mu.RLock()
	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	e.rcvListMu.Lock()
	bufUsed := e.rcvBufUsed
	if s := e.state; !s.connected() && s != StateClose && bufUsed == 0 {
		e.rcvListMu.Unlock()
		// Capture the hard error while still holding e.mu.
		he := e.HardError
		e.mu.RUnlock()
		if s == StateError {
			return buffer.View{}, tcpip.ControlMessages{}, he
		}
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return buffer.View{}, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
	}

	v, err := e.readLocked()
	e.rcvListMu.Unlock()

	e.mu.RUnlock()

	if err == tcpip.ErrClosedForReceive {
		e.stats.ReadErrors.ReadClosed.Increment()
	}
	return v, tcpip.ControlMessages{}, err
}
| |
| func (e *endpoint) readLocked() (buffer.View, *tcpip.Error) { |
| if e.rcvBufUsed == 0 { |
| if e.rcvClosed || !e.state.connected() { |
| return buffer.View{}, tcpip.ErrClosedForReceive |
| } |
| return buffer.View{}, tcpip.ErrWouldBlock |
| } |
| |
| s := e.rcvList.Front() |
| views := s.data.Views() |
| v := views[s.viewToDeliver] |
| s.viewToDeliver++ |
| |
| if s.viewToDeliver >= len(views) { |
| e.rcvList.Remove(s) |
| s.decRef() |
| } |
| |
| e.rcvBufUsed -= len(v) |
| // If the window was zero before this read and if the read freed up |
| // enough buffer space for the scaled window to be non-zero then notify |
| // the protocol goroutine to send a window update. |
| if e.zeroWindow && !e.zeroReceiveWindow(e.rcv.rcvWndScale) { |
| e.zeroWindow = false |
| e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) |
| } |
| |
| return v, nil |
| } |
| |
| // isEndpointWritableLocked checks if a given endpoint is writable |
| // and also returns the number of bytes that can be written at this |
| // moment. If the endpoint is not writable then it returns an error |
| // indicating the reason why it's not writable. |
| // Caller must hold e.mu and e.sndBufMu |
| func (e *endpoint) isEndpointWritableLocked() (int, *tcpip.Error) { |
| // The endpoint cannot be written to if it's not connected. |
| if !e.state.connected() { |
| switch e.state { |
| case StateError: |
| return 0, e.HardError |
| default: |
| return 0, tcpip.ErrClosedForSend |
| } |
| } |
| |
| // Check if the connection has already been closed for sends. |
| if e.sndClosed { |
| return 0, tcpip.ErrClosedForSend |
| } |
| |
| avail := e.sndBufSize - e.sndBufUsed |
| if avail <= 0 { |
| return 0, tcpip.ErrWouldBlock |
| } |
| return avail, nil |
| } |
| |
// Write writes data to the endpoint's peer.
//
// Returns the number of bytes queued. The locks are deliberately dropped
// while copying the payload (unless opts.Atomic) and reacquired afterwards;
// the writability re-check below is what makes that safe.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, <-chan struct{}, *tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.mu.RLock()
	e.sndBufMu.Lock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.sndBufMu.Unlock()
		e.mu.RUnlock()
		e.stats.WriteErrors.WriteClosed.Increment()
		return 0, nil, err
	}

	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		e.sndBufMu.Unlock()
		e.mu.RUnlock()
	}

	// Fetch data.
	v, perr := p.Payload(avail)
	if perr != nil || len(v) == 0 {
		if opts.Atomic { // See above.
			e.sndBufMu.Unlock()
			e.mu.RUnlock()
		}
		// Note that perr may be nil if len(v) == 0.
		return 0, nil, perr
	}

	if !opts.Atomic { // See above.
		e.mu.RLock()
		e.sndBufMu.Lock()

		// Because we released the lock before copying, check state again
		// to make sure the endpoint is still in a valid state for a write.
		avail, err = e.isEndpointWritableLocked()
		if err != nil {
			e.sndBufMu.Unlock()
			e.mu.RUnlock()
			e.stats.WriteErrors.WriteClosed.Increment()
			return 0, nil, err
		}

		// Discard any excess data copied in due to avail being reduced due
		// to a simultaneous write call to the socket.
		if avail < len(v) {
			v = v[:avail]
		}
	}

	// Add data to the send queue.
	s := newSegmentFromView(&e.route, e.ID, v)
	e.sndBufUsed += len(v)
	e.sndBufInQueue += seqnum.Size(len(v))
	e.sndQueue.PushBack(s)
	e.sndBufMu.Unlock()
	// Release the endpoint lock to prevent deadlocks due to lock
	// order inversion when acquiring workMu.
	e.mu.RUnlock()

	if e.workMu.TryLock() {
		// Do the work inline.
		e.handleWrite()
		e.workMu.Unlock()
	} else {
		// Let the protocol goroutine do the work.
		e.sndWaker.Assert()
	}

	return int64(len(v)), nil, nil
}
| |
// Peek reads data without consuming it from the endpoint.
//
// This method does not block if there is no data pending.
func (e *endpoint) Peek(vec [][]byte) (int64, tcpip.ControlMessages, *tcpip.Error) {
	e.mu.RLock()
	defer e.mu.RUnlock()

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data.
	if s := e.state; !s.connected() && s != StateClose {
		if s == StateError {
			return 0, tcpip.ControlMessages{}, e.HardError
		}
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return 0, tcpip.ControlMessages{}, tcpip.ErrInvalidEndpointState
	}

	e.rcvListMu.Lock()
	defer e.rcvListMu.Unlock()

	if e.rcvBufUsed == 0 {
		if e.rcvClosed || !e.state.connected() {
			e.stats.ReadErrors.ReadClosed.Increment()
			return 0, tcpip.ControlMessages{}, tcpip.ErrClosedForReceive
		}
		return 0, tcpip.ControlMessages{}, tcpip.ErrWouldBlock
	}

	// Make a copy of vec so we can modify the slice headers without
	// affecting the caller's slices.
	vec = append([][]byte(nil), vec...)

	var num int64
	for s := e.rcvList.Front(); s != nil; s = s.Next() {
		views := s.data.Views()

		// Start from viewToDeliver so data already consumed by Read is
		// not peeked again.
		for i := s.viewToDeliver; i < len(views); i++ {
			v := views[i]

			for len(v) > 0 {
				if len(vec) == 0 {
					// The caller's buffers are full; stop early.
					return num, tcpip.ControlMessages{}, nil
				}
				if len(vec[0]) == 0 {
					vec = vec[1:]
					continue
				}

				n := copy(vec[0], v)
				v = v[n:]
				vec[0] = vec[0][n:]
				num += int64(n)
			}
		}
	}

	return num, tcpip.ControlMessages{}, nil
}
| |
| // zeroReceiveWindow checks if the receive window to be announced now would be |
| // zero, based on the amount of available buffer and the receive window scaling. |
| // |
| // It must be called with rcvListMu held. |
| func (e *endpoint) zeroReceiveWindow(scale uint8) bool { |
| if e.rcvBufUsed >= e.rcvBufSize { |
| return true |
| } |
| |
| return ((e.rcvBufSize - e.rcvBufUsed) >> scale) == 0 |
| } |
| |
// SetSockOptInt sets an integer-valued socket option.
//
// ReceiveBufferSizeOption and SendBufferSizeOption are clamped to the
// stack-configured min/max; unknown options are silently accepted.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOpt, v int) *tcpip.Error {
	switch opt {
	case tcpip.ReceiveBufferSizeOption:
		// Make sure the receive buffer size is within the min and max
		// allowed.
		var rs ReceiveBufferSizeOption
		size := int(v)
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if size < rs.Min {
				size = rs.Min
			}
			if size > rs.Max {
				size = rs.Max
			}
		}

		mask := uint32(notifyReceiveWindowChanged)

		e.rcvListMu.Lock()

		// Make sure the receive buffer size allows us to send a
		// non-zero window size.
		scale := uint8(0)
		if e.rcv != nil {
			scale = e.rcv.rcvWndScale
		}
		if size>>scale == 0 {
			size = 1 << scale
		}

		// Make sure 2*size doesn't overflow.
		if size > math.MaxInt32/2 {
			size = math.MaxInt32 / 2
		}

		// Setting the size explicitly disables receive-buffer
		// auto-tuning.
		e.rcvBufSize = size
		e.rcvAutoParams.disabled = true
		// If the window was previously zero and the larger buffer now
		// permits a non-zero window, let the protocol goroutine
		// announce it.
		if e.zeroWindow && !e.zeroReceiveWindow(scale) {
			e.zeroWindow = false
			mask |= notifyNonZeroReceiveWindow
		}
		e.rcvListMu.Unlock()

		e.notifyProtocolGoroutine(mask)
		return nil

	case tcpip.SendBufferSizeOption:
		// Make sure the send buffer size is within the min and max
		// allowed.
		size := int(v)
		var ss SendBufferSizeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
			if size < ss.Min {
				size = ss.Min
			}
			if size > ss.Max {
				size = ss.Max
			}
		}

		e.sndBufMu.Lock()
		e.sndBufSize = size
		e.sndBufMu.Unlock()
		return nil

	default:
		return nil
	}
}
| |
// SetSockOpt sets a socket option. Options are dispatched on the dynamic
// type of opt; unrecognized options are silently accepted.
func (e *endpoint) SetSockOpt(opt interface{}) *tcpip.Error {
	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3
	switch v := opt.(type) {
	case tcpip.DelayOption:
		if v == 0 {
			atomic.StoreUint32(&e.delay, 0)

			// Handle delayed data.
			e.sndWaker.Assert()
		} else {
			atomic.StoreUint32(&e.delay, 1)
		}
		return nil

	case tcpip.CorkOption:
		if v == 0 {
			atomic.StoreUint32(&e.cork, 0)

			// Handle the corked data.
			e.sndWaker.Assert()
		} else {
			atomic.StoreUint32(&e.cork, 1)
		}
		return nil

	case tcpip.ReuseAddressOption:
		e.mu.Lock()
		e.reuseAddr = v != 0
		e.mu.Unlock()
		return nil

	case tcpip.ReusePortOption:
		e.mu.Lock()
		e.reusePort = v != 0
		e.mu.Unlock()
		return nil

	case tcpip.BindToDeviceOption:
		e.mu.Lock()
		defer e.mu.Unlock()
		// An empty name clears the binding; otherwise resolve the
		// device name to a NIC id.
		if v == "" {
			e.bindToDevice = 0
			return nil
		}
		for nicid, nic := range e.stack.NICInfo() {
			if nic.Name == string(v) {
				e.bindToDevice = nicid
				return nil
			}
		}
		return tcpip.ErrUnknownDevice

	case tcpip.QuickAckOption:
		// Note: stored inverted — slowAck==1 means quickack disabled.
		if v == 0 {
			atomic.StoreUint32(&e.slowAck, 1)
		} else {
			atomic.StoreUint32(&e.slowAck, 0)
		}
		return nil

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return tcpip.ErrInvalidOptionValue
		}
		e.mu.Lock()
		e.userMSS = int(userMSS)
		e.mu.Unlock()
		e.notifyProtocolGoroutine(notifyMSSChanged)
		return nil

	case tcpip.V6OnlyOption:
		// We only recognize this option on v6 endpoints.
		if e.NetProto != header.IPv6ProtocolNumber {
			return tcpip.ErrInvalidEndpointState
		}

		e.mu.Lock()
		defer e.mu.Unlock()

		// We only allow this to be set when we're in the initial state.
		if e.state != StateInitial {
			return tcpip.ErrInvalidEndpointState
		}

		e.v6only = v != 0
		return nil

	case tcpip.TTLOption:
		e.mu.Lock()
		e.ttl = uint8(v)
		e.mu.Unlock()
		return nil

	case tcpip.KeepaliveEnabledOption:
		e.keepalive.Lock()
		e.keepalive.enabled = v != 0
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		e.keepalive.count = int(v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)
		return nil

	case tcpip.BroadcastOption:
		e.mu.Lock()
		e.broadcast = v != 0
		e.mu.Unlock()
		return nil

	case tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.AvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if v == tcpip.CongestionControlOption(cc) {
				// Acquire the work mutex as we may need to
				// reinitialize the congestion control state.
				e.mu.Lock()
				state := e.state
				e.cc = v
				e.mu.Unlock()
				switch state {
				case StateEstablished:
					e.workMu.Lock()
					e.mu.Lock()
					// Re-check: the connection may have left
					// ESTABLISHED while we were acquiring
					// workMu; only reinitialize if not.
					if e.state == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
					e.mu.Unlock()
					e.workMu.Unlock()
				}
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return tcpip.ErrNoSuchFile

	case tcpip.IPv4TOSOption:
		e.mu.Lock()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.mu.Unlock()
		return nil

	case tcpip.IPv6TrafficClassOption:
		e.mu.Lock()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.mu.Unlock()
		return nil

	default:
		return nil
	}
}
| |
| // readyReceiveSize returns the number of bytes ready to be received. |
| func (e *endpoint) readyReceiveSize() (int, *tcpip.Error) { |
| e.mu.RLock() |
| defer e.mu.RUnlock() |
| |
| // The endpoint cannot be in listen state. |
| if e.state == StateListen { |
| return 0, tcpip.ErrInvalidEndpointState |
| } |
| |
| e.rcvListMu.Lock() |
| defer e.rcvListMu.Unlock() |
| |
| return e.rcvBufUsed, nil |
| } |
| |
| // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. |
| func (e *endpoint) GetSockOptInt(opt tcpip.SockOpt) (int, *tcpip.Error) { |
| switch opt { |
| case tcpip.ReceiveQueueSizeOption: |
| return e.readyReceiveSize() |
| case tcpip.SendBufferSizeOption: |
| e.sndBufMu.Lock() |
| v := e.sndBufSize |
| e.sndBufMu.Unlock() |
| return v, nil |
| |
| case tcpip.ReceiveBufferSizeOption: |
| e.rcvListMu.Lock() |
| v := e.rcvBufSize |
| e.rcvListMu.Unlock() |
| return v, nil |
| |
| } |
| return -1, tcpip.ErrUnknownProtocolOption |
| } |
| |
| // GetSockOpt implements tcpip.Endpoint.GetSockOpt. |
| func (e *endpoint) GetSockOpt(opt interface{}) *tcpip.Error { |
| switch o := opt.(type) { |
| case tcpip.ErrorOption: |
| e.lastErrorMu.Lock() |
| err := e.lastError |
| e.lastError = nil |
| e.lastErrorMu.Unlock() |
| return err |
| |
| case *tcpip.MaxSegOption: |
| // This is just stubbed out. Linux never returns the user_mss |
| // value as it either returns the defaultMSS or returns the |
| // actual current MSS. Netstack just returns the defaultMSS |
| // always for now. |
| *o = header.TCPDefaultMSS |
| return nil |
| |
| case *tcpip.DelayOption: |
| *o = 0 |
| if v := atomic.LoadUint32(&e.delay); v != 0 { |
| *o = 1 |
| } |
| return nil |
| |
| case *tcpip.CorkOption: |
| *o = 0 |
| if v := atomic.LoadUint32(&e.cork); v != 0 { |
| *o = 1 |
| } |
| return nil |
| |
| case *tcpip.ReuseAddressOption: |
| e.mu.RLock() |
| v := e.reuseAddr |
| e.mu.RUnlock() |
| |
| *o = 0 |
| if v { |
| *o = 1 |
| } |
| return nil |
| |
| case *tcpip.ReusePortOption: |
| e.mu.RLock() |
| v := e.reusePort |
| e.mu.RUnlock() |
| |
| *o = 0 |
| if v { |
| *o = 1 |
| } |
| return nil |
| |
| case *tcpip.BindToDeviceOption: |
| e.mu.RLock() |
| defer e.mu.RUnlock() |
| if nic, ok := e.stack.NICInfo()[e.bindToDevice]; ok { |
| *o = tcpip.BindToDeviceOption(nic.Name) |
| return nil |
| } |
| *o = "" |
| return nil |
| |
| case *tcpip.QuickAckOption: |
| *o = 1 |
| if v := atomic.LoadUint32(&e.slowAck); v != 0 { |
| *o = 0 |
| } |
| return nil |
| |
| case *tcpip.V6OnlyOption: |
| // We only recognize this option on v6 endpoints. |
| if e.NetProto != header.IPv6ProtocolNumber { |
| return tcpip.ErrUnknownProtocolOption |
| } |
| |
| e.mu.Lock() |
| v := e.v6only |
| e.mu.Unlock() |
| |
| *o = 0 |
| if v { |
| *o = 1 |
| } |
| return nil |
| |
| case *tcpip.TTLOption: |
| e.mu.Lock() |
| *o = tcpip.TTLOption(e.ttl) |
| e.mu.Unlock() |
| return nil |
| |
| case *tcpip.TCPInfoOption: |
| *o = tcpip.TCPInfoOption{} |
| e.mu.RLock() |
| snd := e.snd |
| e.mu.RUnlock() |
| if snd != nil { |
| snd.rtt.Lock() |
| o.RTT = snd.rtt.srtt |
| o.RTTVar = snd.rtt.rttvar |
| snd.rtt.Unlock() |
| } |
| return nil |
| |
| case *tcpip.KeepaliveEnabledOption: |
| e.keepalive.Lock() |
| v := e.keepalive.enabled |
| e.keepalive.Unlock() |
| |
| *o = 0 |
| if v { |
| *o = 1 |
| } |
| return nil |
| |
| case *tcpip.KeepaliveIdleOption: |
| e.keepalive.Lock() |
| *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) |
| e.keepalive.Unlock() |
| return nil |
| |
| case *tcpip.KeepaliveIntervalOption: |
| e.keepalive.Lock() |
| *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) |
| e.keepalive.Unlock() |
| return nil |
| |
| case *tcpip.KeepaliveCountOption: |
| e.keepalive.Lock() |
| *o = tcpip.KeepaliveCountOption(e.keepalive.count) |
| e.keepalive.Unlock() |
| return nil |
| |
| case *tcpip.OutOfBandInlineOption: |
| // We don't currently support disabling this option. |
| *o = 1 |
| return nil |
| |
| case *tcpip.BroadcastOption: |
| e.mu.Lock() |
| v := e.broadcast |
| e.mu.Unlock() |
| |
| *o = 0 |
| if v { |
| *o = 1 |
| } |
| return nil |
| |
| case *tcpip.CongestionControlOption: |
| e.mu.Lock() |
| *o = e.cc |
| e.mu.Unlock() |
| return nil |
| |
| case *tcpip.IPv4TOSOption: |
| e.mu.RLock() |
| *o = tcpip.IPv4TOSOption(e.sendTOS) |
| e.mu.RUnlock() |
| return nil |
| |
| case *tcpip.IPv6TrafficClassOption: |
| e.mu.RLock() |
| *o = tcpip.IPv6TrafficClassOption(e.sendTOS) |
| e.mu.RUnlock() |
| return nil |
| |
| default: |
| return tcpip.ErrUnknownProtocolOption |
| } |
| } |
| |
| func (e *endpoint) checkV4Mapped(addr *tcpip.FullAddress) (tcpip.NetworkProtocolNumber, *tcpip.Error) { |
| netProto := e.NetProto |
| if header.IsV4MappedAddress(addr.Addr) { |
| // Fail if using a v4 mapped address on a v6only endpoint. |
| if e.v6only { |
| return 0, tcpip.ErrNoRoute |
| } |
| |
| netProto = header.IPv4ProtocolNumber |
| addr.Addr = addr.Addr[header.IPv6AddressSize-header.IPv4AddressSize:] |
| if addr.Addr == header.IPv4Any { |
| addr.Addr = "" |
| } |
| } |
| |
| // Fail if we're bound to an address length different from the one we're |
| // checking. |
| if l := len(e.ID.LocalAddress); l != 0 && len(addr.Addr) != 0 && l != len(addr.Addr) { |
| return 0, tcpip.ErrInvalidEndpointState |
| } |
| |
| return netProto, nil |
| } |
| |
// Disconnect implements tcpip.Endpoint.Disconnect.
//
// TCP endpoints cannot be returned to an unconnected state, so this always
// fails with ErrNotSupported.
func (*endpoint) Disconnect() *tcpip.Error {
	return tcpip.ErrNotSupported
}
| |
| // Connect connects the endpoint to its peer. |
| func (e *endpoint) Connect(addr tcpip.FullAddress) *tcpip.Error { |
| err := e.connect(addr, true, true) |
| if err != nil && !err.IgnoreStats() { |
| e.stack.Stats().TCP.FailedConnectionAttempts.Increment() |
| e.stats.FailedConnectionAttempts.Increment() |
| } |
| return err |
| } |
| |
| // connect connects the endpoint to its peer. In the normal non-S/R case, the |
| // new connection is expected to run the main goroutine and perform handshake. |
| // In restore of previously connected endpoints, both ends will be passively |
| // created (so no new handshaking is done); for stack-accepted connections not |
| // yet accepted by the app, they are restored without running the main goroutine |
| // here. |
| func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) *tcpip.Error { |
| e.mu.Lock() |
| defer e.mu.Unlock() |
| |
| connectingAddr := addr.Addr |
| |
| netProto, err := e.checkV4Mapped(&addr) |
| if err != nil { |
| return err |
| } |
| |
| if e.state.connected() { |
| // The endpoint is already connected. If caller hasn't been |
| // notified yet, return success. |
| if !e.isConnectNotified { |
| e.isConnectNotified = true |
| return nil |
| } |
| // Otherwise return that it's already connected. |
| return tcpip.ErrAlreadyConnected |
| } |
| |
| nicid := addr.NIC |
| switch e.state { |
| case StateBound: |
| // If we're already bound to a NIC but the caller is requesting |
| // that we use a different one now, we cannot proceed. |
| if e.boundNICID == 0 { |
| break |
| } |
| |
| if nicid != 0 && nicid != e.boundNICID { |
| return tcpip.ErrNoRoute |
| } |
| |
| nicid = e.boundNICID |
| |
| case StateInitial: |
| // Nothing to do. We'll eventually fill-in the gaps in the ID (if any) |
| // when we find a route. |
| |
| case StateConnecting, StateSynSent, StateSynRecv: |
| // A connection request has already been issued but hasn't completed |
| // yet. |
| return tcpip.ErrAlreadyConnecting |
| |
| case StateError: |
| return e.HardError |
| |
| default: |
| return tcpip.ErrInvalidEndpointState |
| } |
| |
| // Find a route to the desired destination. |
| r, err := e.stack.FindRoute(nicid, e.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */) |
| if err != nil { |
| return err |
| } |
| defer r.Release() |
| |
| origID := e.ID |
| |
| netProtos := []tcpip.NetworkProtocolNumber{netProto} |
| e.ID.LocalAddress = r.LocalAddress |
| e.ID.RemoteAddress = r.RemoteAddress |
| e.ID.RemotePort = addr.Port |
| |
| if e.ID.LocalPort != 0 { |
| // The endpoint is bound to a port, attempt to register it. |
| err := e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice) |
| if err != nil { |
| return err |
| } |
| } else { |
| // The endpoint doesn't have a local port yet, so try to get |
| // one. Make sure that it isn't one that will result in the same |
| // address/port for both local and remote (otherwise this |
| // endpoint would be trying to connect to itself). |
| sameAddr := e.ID.LocalAddress == e.ID.RemoteAddress |
| |
| // Calculate a port offset based on the destination IP/port and |
| // src IP to ensure that for a given tuple (srcIP, destIP, |
| // destPort) the offset used as a starting point is the same to |
| // ensure that we can cycle through the port space effectively. |
| h := jenkins.Sum32(e.stack.PortSeed()) |
| h.Write([]byte(e.ID.LocalAddress)) |
| h.Write([]byte(e.ID.RemoteAddress)) |
| portBuf := make([]byte, 2) |
| binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort) |
| h.Write(portBuf) |
| portOffset := h.Sum32() |
| |
| if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, *tcpip.Error) { |
| if sameAddr && p == e.ID.RemotePort { |
| return false, nil |
| } |
| // reusePort is false below because connect cannot reuse a port even if |
| // reusePort was set. |
| if !e.stack.IsPortAvailable(netProtos, ProtocolNumber, e.ID.LocalAddress, p, false /* reusePort */, e.bindToDevice) { |
| return false, nil |
| } |
| |
| id := e.ID |
| id.LocalPort = p |
| switch e.stack.RegisterTransportEndpoint(nicid, netProtos, ProtocolNumber, id, e, e.reusePort, e.bindToDevice) { |
| case nil: |
| e.ID = id |
| return true, nil |
| case tcpip.ErrPortInUse: |
| return false, nil |
| default: |
| return false, err |
| } |
| }); err != nil { |
| return err |
| } |
| } |
| |
| // Remove the port reservation. This can happen when Bind is called |
| // before Connect: in such a case we don't want to hold on to |
| // reservations anymore. |
| if e.isPortReserved { |
| e.stack.ReleasePort(e.effectiveNetProtos, ProtocolNumber, origID.LocalAddress, origID.LocalPort, e.bindToDevice) |
| e.isPortReserved = false |
| } |
| |
| e.isRegistered = true |
| e.state = StateConnecting |
| e.route = r.Clone() |
| e.boundNICID = nicid |
| e.effectiveNetProtos = netProtos |
| e.connectingAddress = connectingAddr |
| |
| e.initGSO() |
| |
| // Connect in the restore phase does not perform handshake. Restore its |
| // connection setting here. |
| if !handshake { |
| e.segmentQueue.mu.Lock() |
| for _, l := range []segmentList{e.segmentQueue.list, e.sndQueue, e.snd.writeList} { |
| for s := l.Front(); s != nil; s = s.Next() { |
| s.id = e.ID |
| s.route = r.Clone() |
| e.sndWaker.Assert() |
| } |
| } |
| e.segmentQueue.mu.Unlock() |
| e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0) |
| e.state = StateEstablished |
| e.stack.Stats().TCP.CurrentEstablished.Increment() |
| } |
| |
| if run { |
| e.workerRunning = true |
| e.stack.Stats().TCP.ActiveConnectionOpenings.Increment() |
| go e.protocolMainLoop(handshake) |
| } |
| |
| return tcpip.ErrConnectStarted |
| } |
| |
// ConnectEndpoint is not supported.
//
// Endpoint-to-endpoint connection is a Unix-domain concept; TCP endpoints
// always reject it.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) *tcpip.Error {
	return tcpip.ErrInvalidEndpointState
}
| |
// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
//
// Flags accumulate across calls. Closing both directions while unread data
// remains aborts the connection with a RST, matching Linux behavior.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) *tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()
	e.shutdownFlags |= flags

	switch {
	case e.state.connected():
		// Close for read.
		if (e.shutdownFlags & tcpip.ShutdownRead) != 0 {
			// Mark read side as closed.
			e.rcvListMu.Lock()
			e.rcvClosed = true
			rcvBufUsed := e.rcvBufUsed
			e.rcvListMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if (e.shutdownFlags&tcpip.ShutdownWrite) != 0 && rcvBufUsed > 0 {
				e.notifyProtocolGoroutine(notifyReset)
				return nil
			}
		}

		// Close for write.
		if (e.shutdownFlags & tcpip.ShutdownWrite) != 0 {
			e.sndBufMu.Lock()

			if e.sndClosed {
				// Already closed.
				e.sndBufMu.Unlock()
				break
			}

			// Queue fin segment. A nil view marks the FIN.
			s := newSegmentFromView(&e.route, e.ID, nil)
			e.sndQueue.PushBack(s)
			e.sndBufInQueue++

			// Mark endpoint as closed.
			e.sndClosed = true

			e.sndBufMu.Unlock()

			// Tell protocol goroutine to close.
			e.sndCloseWaker.Assert()
		}

	case e.state == StateListen:
		// Tell protocolListenLoop to stop.
		if flags&tcpip.ShutdownRead != 0 {
			e.notifyProtocolGoroutine(notifyClose)
		}

	default:
		return tcpip.ErrNotConnected
	}

	return nil
}
| |
| // Listen puts the endpoint in "listen" mode, which allows it to accept |
| // new connections. |
| func (e *endpoint) Listen(backlog int) *tcpip.Error { |
| err := e.listen(backlog) |
| if err != nil && !err.IgnoreStats() { |
| e.stack.Stats().TCP.FailedConnectionAttempts.Increment() |
| e.stats.FailedConnectionAttempts.Increment() |
| } |
| return err |
| } |
| |
// listen transitions a bound endpoint into the listening state, or adjusts
// the backlog of an endpoint that is already listening.
func (e *endpoint) listen(backlog int) *tcpip.Error {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Allow the backlog to be adjusted if the endpoint is not shutting down.
	// When the endpoint shuts down, it sets workerCleanup to true, and from
	// that point onward, acceptedChan is the responsibility of the cleanup()
	// method (and should not be touched anywhere else, including here).
	if e.state == StateListen && !e.workerCleanup {
		// Adjust the size of the channel iff we can fix existing
		// pending connections into the new one.
		if len(e.acceptedChan) > backlog {
			return tcpip.ErrInvalidEndpointState
		}
		if cap(e.acceptedChan) == backlog {
			return nil
		}
		// Move pending connections into a channel of the new
		// capacity. Closing the old channel lets range drain it.
		origChan := e.acceptedChan
		e.acceptedChan = make(chan *endpoint, backlog)
		close(origChan)
		for ep := range origChan {
			e.acceptedChan <- ep
		}
		return nil
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.state != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return tcpip.ErrInvalidEndpointState
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.boundNICID, e.effectiveNetProtos, ProtocolNumber, e.ID, e, e.reusePort, e.bindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.state = StateListen
	if e.acceptedChan == nil {
		e.acceptedChan = make(chan *endpoint, backlog)
	}
	e.workerRunning = true

	go e.protocolListenLoop(
		seqnum.Size(e.receiveBufferAvailable()))

	return nil
}
| |
// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections.
//
// The accepted endpoint n takes the caller-provided waiter queue and runs
// protocolMainLoop without a handshake (the connection is already
// established).
func (e *endpoint) startAcceptedLoop(waiterQueue *waiter.Queue) {
	e.waiterQueue = waiterQueue
	e.workerRunning = true
	go e.protocolMainLoop(false)
}
| |
| // Accept returns a new endpoint if a peer has established a connection |
| // to an endpoint previously set to listen mode. |
| func (e *endpoint) Accept() (tcpip.Endpoint, *waiter.Queue, *tcpip.Error) { |
| e.mu.RLock() |
| defer e.mu.RUnlock() |
| |
| // Endpoint must be in listen state before it can accept connections. |
| if e.state != StateListen { |
| return nil, nil, tcpip.ErrInvalidEndpointState |
| } |
| |
| // Get the new accepted endpoint. |
| var n *endpoint |
| select { |
| case n = <-e.acceptedChan: |
| default: |
| return nil, nil, tcpip.ErrWouldBlock |
| } |
| |
| // Start the protocol goroutine. |
| wq := &waiter.Queue{} |
| n.startAcceptedLoop(wq) |
| e.stack.Stats().TCP.PassiveConnectionOpenings.Increment() |
| |
| return n, wq, nil |
| } |
| |
// Bind binds the endpoint to a specific local port and optionally address.
//
// The named return err is deliberate: the deferred cleanup below inspects
// it to undo the port reservation on any failure after the reservation is
// made.
func (e *endpoint) Bind(addr tcpip.FullAddress) (err *tcpip.Error) {
	e.mu.Lock()
	defer e.mu.Unlock()

	// Don't allow binding once endpoint is not in the initial state
	// anymore. This is because once the endpoint goes into a connected or
	// listen state, it is already bound.
	if e.state != StateInitial {
		return tcpip.ErrAlreadyBound
	}

	e.BindAddr = addr.Addr
	netProto, err := e.checkV4Mapped(&addr)
	if err != nil {
		return err
	}

	// Expand netProtos to include v4 and v6 if the caller is binding to a
	// wildcard (empty) address, and this is an IPv6 endpoint with v6only
	// set to false.
	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	if netProto == header.IPv6ProtocolNumber && !e.v6only && addr.Addr == "" {
		netProtos = []tcpip.NetworkProtocolNumber{
			header.IPv6ProtocolNumber,
			header.IPv4ProtocolNumber,
		}
	}

	port, err := e.stack.ReservePort(netProtos, ProtocolNumber, addr.Addr, addr.Port, e.reusePort, e.bindToDevice)
	if err != nil {
		return err
	}

	e.isPortReserved = true
	e.effectiveNetProtos = netProtos
	e.ID.LocalPort = port

	// Any failures beyond this point must remove the port registration.
	defer func(bindToDevice tcpip.NICID) {
		if err != nil {
			e.stack.ReleasePort(netProtos, ProtocolNumber, addr.Addr, port, bindToDevice)
			e.isPortReserved = false
			e.effectiveNetProtos = nil
			e.ID.LocalPort = 0
			e.ID.LocalAddress = ""
			e.boundNICID = 0
		}
	}(e.bindToDevice)

	// If an address is specified, we must ensure that it's one of our
	// local addresses.
	if len(addr.Addr) != 0 {
		nic := e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr)
		if nic == 0 {
			return tcpip.ErrBadLocalAddress
		}

		e.boundNICID = nic
		e.ID.LocalAddress = addr.Addr
	}

	// Mark endpoint as bound.
	e.state = StateBound

	return nil
}
| |
| // GetLocalAddress returns the address to which the endpoint is bound. |
| func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, *tcpip.Error) { |
| e.mu.RLock() |
| defer e.mu.RUnlock() |
| |
| return tcpip.FullAddress{ |
| Addr: e.ID.LocalAddress, |
| Port: e.ID.LocalPort, |
| NIC: e.boundNICID, |
| }, nil |
| } |
| |
| // GetRemoteAddress returns the address to which the endpoint is connected. |
| func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, *tcpip.Error) { |
| e.mu.RLock() |
| defer e.mu.RUnlock() |
| |
| if !e.state.connected() { |
| return tcpip.FullAddress{}, tcpip.ErrNotConnected |
| } |
| |
| return tcpip.FullAddress{ |
| Addr: e.ID.RemoteAddress, |
| Port: e.ID.RemotePort, |
| NIC: e.boundNICID, |
| }, nil |
| } |
| |
// HandlePacket is called by the stack when new packets arrive to this transport
// endpoint.
//
// Malformed or checksum-invalid segments are counted and dropped; valid
// segments are queued for the protocol goroutine, which is woken via
// newSegmentWaker.
func (e *endpoint) HandlePacket(r *stack.Route, id stack.TransportEndpointID, vv buffer.VectorisedView) {
	s := newSegment(r, id, vv)
	if !s.parse() {
		e.stack.Stats().MalformedRcvdPackets.Increment()
		e.stack.Stats().TCP.InvalidSegmentsReceived.Increment()
		e.stats.ReceiveErrors.MalformedPacketsReceived.Increment()
		s.decRef()
		return
	}

	if !s.csumValid {
		e.stack.Stats().MalformedRcvdPackets.Increment()
		e.stack.Stats().TCP.ChecksumErrors.Increment()
		e.stats.ReceiveErrors.ChecksumErrors.Increment()
		s.decRef()
		return
	}

	e.stack.Stats().TCP.ValidSegmentsReceived.Increment()
	e.stats.SegmentsReceived.Increment()
	if (s.flags & header.TCPFlagRst) != 0 {
		e.stack.Stats().TCP.ResetsReceived.Increment()
	}

	// Send packet to worker goroutine.
	if e.segmentQueue.enqueue(s) {
		e.newSegmentWaker.Assert()
	} else {
		// The queue is full, so we drop the segment.
		e.stack.Stats().DroppedPackets.Increment()
		e.stats.ReceiveErrors.SegmentQueueDropped.Increment()
		s.decRef()
	}
}
| |
| // HandleControlPacket implements stack.TransportEndpoint.HandleControlPacket. |
| func (e *endpoint) HandleControlPacket(id stack.TransportEndpointID, typ stack.ControlType, extra uint32, vv buffer.VectorisedView) { |
| switch typ { |
| case stack.ControlPacketTooBig: |
| e.sndBufMu.Lock() |
| e.packetTooBigCount++ |
| if v := int(extra); v < e.sndMTU { |
| e.sndMTU = v |
| } |
| e.sndBufMu.Unlock() |
| |
| e.notifyProtocolGoroutine(notifyMTUChanged) |
| } |
| } |
| |
// updateSndBufferUsage is called by the protocol goroutine when room opens up
// in the send buffer. The number of newly available bytes is v.
func (e *endpoint) updateSndBufferUsage(v int) {
	e.sndBufMu.Lock()
	// Record whether usage was at/above half the buffer BEFORE freeing
	// space; combined with the post-decrement check below this implements
	// the crossing-the-half-mark hysteresis.
	notify := e.sndBufUsed >= e.sndBufSize>>1
	e.sndBufUsed -= v
	// We only notify when there is half the sndBufSize available after
	// a full buffer event occurs. This ensures that we don't wake up
	// writers to queue just 1-2 segments and go back to sleep.
	notify = notify && e.sndBufUsed < e.sndBufSize>>1
	e.sndBufMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.EventOut)
	}
}
| |
// readyToRead is called by the protocol goroutine when a new segment is ready
// to be read, or when the connection is closed for receiving (in which case
// s will be nil).
func (e *endpoint) readyToRead(s *segment) {
	e.rcvListMu.Lock()
	if s != nil {
		s.incRef()
		e.rcvBufUsed += s.data.Size()
		// Check if the receive window is now closed. If so make sure
		// we set the zero window before we deliver the segment to ensure
		// that a subsequent read of the segment will correctly trigger
		// a non-zero notification.
		if avail := e.receiveBufferAvailableLocked(); avail>>e.rcv.rcvWndScale == 0 {
			e.stats.ReceiveErrors.ZeroRcvWindowState.Increment()
			e.zeroWindow = true
		}
		e.rcvList.PushBack(s)
	} else {
		// nil segment means the receive side has been closed.
		e.rcvClosed = true
	}
	e.rcvListMu.Unlock()

	// Wake up any blocked readers.
	e.waiterQueue.Notify(waiter.EventIn)
}
| |
| // receiveBufferAvailableLocked calculates how many bytes are still available |
| // in the receive buffer. |
| // rcvListMu must be held when this function is called. |
| func (e *endpoint) receiveBufferAvailableLocked() int { |
| // We may use more bytes than the buffer size when the receive buffer |
| // shrinks. |
| if e.rcvBufUsed >= e.rcvBufSize { |
| return 0 |
| } |
| |
| return e.rcvBufSize - e.rcvBufUsed |
| } |
| |
| // receiveBufferAvailable calculates how many bytes are still available in the |
| // receive buffer. |
| func (e *endpoint) receiveBufferAvailable() int { |
| e.rcvListMu.Lock() |
| available := e.receiveBufferAvailableLocked() |
| e.rcvListMu.Unlock() |
| return available |
| } |
| |
| func (e *endpoint) receiveBufferSize() int { |
| e.rcvListMu.Lock() |
| size := e.rcvBufSize |
| e.rcvListMu.Unlock() |
| |
| return size |
| } |
| |
| func (e *endpoint) maxReceiveBufferSize() int { |
| var rs ReceiveBufferSizeOption |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { |
| // As a fallback return the hardcoded max buffer size. |
| return MaxBufferSize |
| } |
| return rs.Max |
| } |
| |
| // rcvWndScaleForHandshake computes the receive window scale to offer to the |
| // peer when window scaling is enabled (true by default). If auto-tuning is |
| // disabled then the window scaling factor is based on the size of the |
| // receiveBuffer otherwise we use the max permissible receive buffer size to |
| // compute the scale. |
| func (e *endpoint) rcvWndScaleForHandshake() int { |
| bufSizeForScale := e.receiveBufferSize() |
| |
| e.rcvListMu.Lock() |
| autoTuningDisabled := e.rcvAutoParams.disabled |
| e.rcvListMu.Unlock() |
| if autoTuningDisabled { |
| return FindWndScale(seqnum.Size(bufSizeForScale)) |
| } |
| |
| return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) |
| } |
| |
| // updateRecentTimestamp updates the recent timestamp using the algorithm |
| // described in https://tools.ietf.org/html/rfc7323#section-4.3 |
| func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { |
| if e.sendTSOk && seqnum.Value(e.recentTS).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { |
| e.recentTS = tsVal |
| } |
| } |
| |
| // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if |
| // the SYN options indicate that timestamp option was negotiated. It also |
| // initializes the recentTS with the value provided in synOpts.TSval. |
| func (e *endpoint) maybeEnableTimestamp(synOpts *header.TCPSynOptions) { |
| if synOpts.TS { |
| e.sendTSOk = true |
| e.recentTS = synOpts.TSVal |
| } |
| } |
| |
| // timestamp returns the timestamp value to be used in the TSVal field of the |
| // timestamp option for outgoing TCP segments for a given endpoint. |
| func (e *endpoint) timestamp() uint32 { |
| return tcpTimeStamp(e.tsOffset) |
| } |
| |
// tcpTimeStamp returns a timestamp offset by the provided offset. This is
// not inlined above as it's used when SYN cookies are in use and endpoint
// is not created at the time when the SYN cookie is sent.
func tcpTimeStamp(offset uint32) uint32 {
	// Milliseconds since the Unix epoch, truncated to 32 bits, plus the
	// caller-supplied offset.
	ms := time.Now().UnixNano() / int64(time.Millisecond)
	return uint32(ms) + offset
}
| |
// timeStampOffset returns a randomized timestamp offset to be used when sending
// timestamp values in a timestamp option for a TCP segment.
func timeStampOffset() uint32 {
	var b [4]byte
	if _, err := rand.Read(b[:]); err != nil {
		panic(err)
	}
	// Initialize a random tsOffset that will be added to the recentTS
	// everytime the timestamp is sent when the Timestamp option is enabled.
	//
	// See https://tools.ietf.org/html/rfc7323#section-5.4 for details on
	// why this is required.
	//
	// NOTE: This is not completely to spec as normally this should be
	// initialized in a manner analogous to how sequence numbers are
	// randomized per connection basis. But for now this is sufficient.
	return binary.LittleEndian.Uint32(b[:])
}
| |
| // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint |
| // if the SYN options indicate that the SACK option was negotiated and the TCP |
| // stack is configured to enable TCP SACK option. |
| func (e *endpoint) maybeEnableSACKPermitted(synOpts *header.TCPSynOptions) { |
| var v SACKEnabled |
| if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { |
| // Stack doesn't support SACK. So just return. |
| return |
| } |
| if bool(v) && synOpts.SACKPermitted { |
| e.sackPermitted = true |
| } |
| } |
| |
| // maxOptionSize return the maximum size of TCP options. |
| func (e *endpoint) maxOptionSize() (size int) { |
| var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock |
| options := e.makeOptions(maxSackBlocks[:]) |
| size = len(options) |
| putOptions(options) |
| |
| return size |
| } |
| |
// completeState makes a full copy of the endpoint and returns it. This is used
// before invoking the probe. The state returned may not be fully consistent if
// there are intervening syscalls when the state is being copied.
func (e *endpoint) completeState() stack.TCPEndpointState {
	var s stack.TCPEndpointState
	s.SegTime = time.Now()

	// Copy EndpointID.
	e.mu.Lock()
	s.ID = stack.TCPEndpointID(e.ID)
	e.mu.Unlock()

	// Copy endpoint rcv state.
	e.rcvListMu.Lock()
	s.RcvBufSize = e.rcvBufSize
	s.RcvBufUsed = e.rcvBufUsed
	s.RcvClosed = e.rcvClosed
	s.RcvAutoParams.MeasureTime = e.rcvAutoParams.measureTime
	s.RcvAutoParams.CopiedBytes = e.rcvAutoParams.copied
	s.RcvAutoParams.PrevCopiedBytes = e.rcvAutoParams.prevCopied
	s.RcvAutoParams.RTT = e.rcvAutoParams.rtt
	s.RcvAutoParams.RTTMeasureSeqNumber = e.rcvAutoParams.rttMeasureSeqNumber
	s.RcvAutoParams.RTTMeasureTime = e.rcvAutoParams.rttMeasureTime
	s.RcvAutoParams.Disabled = e.rcvAutoParams.disabled
	e.rcvListMu.Unlock()

	// Endpoint TCP Option state.
	// NOTE(review): the option/SACK fields below are read without holding
	// any lock — presumably an accepted best-effort snapshot per the
	// function comment; confirm no stricter guarantee is expected.
	s.SendTSOk = e.sendTSOk
	s.RecentTS = e.recentTS
	s.TSOffset = e.tsOffset
	s.SACKPermitted = e.sackPermitted
	s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks)
	copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks])
	s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy()

	// Copy endpoint send state.
	e.sndBufMu.Lock()
	s.SndBufSize = e.sndBufSize
	s.SndBufUsed = e.sndBufUsed
	s.SndClosed = e.sndClosed
	s.SndBufInQueue = e.sndBufInQueue
	s.PacketTooBigCount = e.packetTooBigCount
	s.SndMTU = e.sndMTU
	e.sndBufMu.Unlock()

	// Copy receiver state.
	// NOTE(review): dereferences e.rcv and e.snd below assume the
	// endpoint is connected (both non-nil) — confirm callers guarantee
	// this.
	s.Receiver = stack.TCPReceiverState{
		RcvNxt:         e.rcv.rcvNxt,
		RcvAcc:         e.rcv.rcvAcc,
		RcvWndScale:    e.rcv.rcvWndScale,
		PendingBufUsed: e.rcv.pendingBufUsed,
		PendingBufSize: e.rcv.pendingBufSize,
	}

	// Copy sender state.
	s.Sender = stack.TCPSenderState{
		LastSendTime: e.snd.lastSendTime,
		DupAckCount:  e.snd.dupAckCount,
		FastRecovery: stack.TCPFastRecoveryState{
			Active:    e.snd.fr.active,
			First:     e.snd.fr.first,
			Last:      e.snd.fr.last,
			MaxCwnd:   e.snd.fr.maxCwnd,
			HighRxt:   e.snd.fr.highRxt,
			RescueRxt: e.snd.fr.rescueRxt,
		},
		SndCwnd:          e.snd.sndCwnd,
		Ssthresh:         e.snd.sndSsthresh,
		SndCAAckCount:    e.snd.sndCAAckCount,
		Outstanding:      e.snd.outstanding,
		SndWnd:           e.snd.sndWnd,
		SndUna:           e.snd.sndUna,
		SndNxt:           e.snd.sndNxt,
		RTTMeasureSeqNum: e.snd.rttMeasureSeqNum,
		RTTMeasureTime:   e.snd.rttMeasureTime,
		Closed:           e.snd.closed,
		RTO:              e.snd.rto,
		MaxPayloadSize:   e.snd.maxPayloadSize,
		SndWndScale:      e.snd.sndWndScale,
		MaxSentAck:       e.snd.maxSentAck,
	}
	// RTT estimates have their own lock inside the sender.
	e.snd.rtt.Lock()
	s.Sender.SRTT = e.snd.rtt.srtt
	s.Sender.SRTTInited = e.snd.rtt.srttInited
	e.snd.rtt.Unlock()

	// Congestion-control details are only exported when CUBIC is in use.
	if cubic, ok := e.snd.cc.(*cubicState); ok {
		s.Sender.Cubic = stack.TCPCubicState{
			WMax:                    cubic.wMax,
			WLastMax:                cubic.wLastMax,
			T:                       cubic.t,
			TimeSinceLastCongestion: time.Since(cubic.t),
			C:                       cubic.c,
			K:                       cubic.k,
			Beta:                    cubic.beta,
			WC:                      cubic.wC,
			WEst:                    cubic.wEst,
		}
	}
	return s
}
| |
| func (e *endpoint) initHardwareGSO() { |
| gso := &stack.GSO{} |
| switch e.route.NetProto { |
| case header.IPv4ProtocolNumber: |
| gso.Type = stack.GSOTCPv4 |
| gso.L3HdrLen = header.IPv4MinimumSize |
| case header.IPv6ProtocolNumber: |
| gso.Type = stack.GSOTCPv6 |
| gso.L3HdrLen = header.IPv6MinimumSize |
| default: |
| panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto)) |
| } |
| gso.NeedsCsum = true |
| gso.CsumOffset = header.TCPChecksumOffset |
| gso.MaxSize = e.route.GSOMaxSize() |
| e.gso = gso |
| } |
| |
| func (e *endpoint) initGSO() { |
| if e.route.Capabilities()&stack.CapabilityHardwareGSO != 0 { |
| e.initHardwareGSO() |
| } else if e.route.Capabilities()&stack.CapabilitySoftwareGSO != 0 { |
| e.gso = &stack.GSO{ |
| MaxSize: e.route.GSOMaxSize(), |
| Type: stack.GSOSW, |
| NeedsCsum: false, |
| } |
| } |
| } |
| |
| // State implements tcpip.Endpoint.State. It exports the endpoint's protocol |
| // state for diagnostics. |
| func (e *endpoint) State() uint32 { |
| e.mu.Lock() |
| defer e.mu.Unlock() |
| return uint32(e.state) |
| } |
| |
| // Info returns a copy of the endpoint info. |
| func (e *endpoint) Info() tcpip.EndpointInfo { |
| e.mu.RLock() |
| // Make a copy of the endpoint info. |
| ret := e.EndpointInfo |
| e.mu.RUnlock() |
| return &ret |
| } |
| |
| // Stats returns a pointer to the endpoint stats. |
| func (e *endpoint) Stats() tcpip.EndpointStats { |
| return &e.stats |
| } |
| |
| func mssForRoute(r *stack.Route) uint16 { |
| return uint16(r.MTU() - header.TCPMinimumSize) |
| } |