| // Copyright 2018 The gVisor Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| package tcp |
| |
| import ( |
| "fmt" |
| "math" |
| "sort" |
| "time" |
| |
| "gvisor.dev/gvisor/pkg/sleep" |
| "gvisor.dev/gvisor/pkg/sync" |
| "gvisor.dev/gvisor/pkg/tcpip" |
| "gvisor.dev/gvisor/pkg/tcpip/buffer" |
| "gvisor.dev/gvisor/pkg/tcpip/header" |
| "gvisor.dev/gvisor/pkg/tcpip/seqnum" |
| ) |
| |
| const ( |
| // MinRTO is the minimum allowed value for the retransmit timeout. |
| MinRTO = 200 * time.Millisecond |
| |
| // MaxRTO is the maximum allowed value for the retransmit timeout. |
| MaxRTO = 120 * time.Second |
| |
| // InitialCwnd is the initial congestion window. |
| InitialCwnd = 10 |
| |
| // nDupAckThreshold is the number of duplicate ACK's required |
| // before fast-retransmit is entered. |
| nDupAckThreshold = 3 |
| |
| // MaxRetries is the maximum number of probe retries sender does |
| // before timing out the connection. |
| // Linux default TCP_RETR2, net.ipv4.tcp_retries2. |
| MaxRetries = 15 |
| ) |
| |
| // congestionControl is an interface that must be implemented by any supported |
| // congestion control algorithm. |
| type congestionControl interface { |
| // HandleLossDetected is invoked when the loss is detected by RACK or |
| // sender.dupAckCount >= nDupAckThreshold just before entering fast |
| // retransmit. |
| HandleLossDetected() |
| |
| // HandleRTOExpired is invoked when the retransmit timer expires. |
| HandleRTOExpired() |
| |
| // Update is invoked when processing inbound acks. It's passed the |
| // number of packet's that were acked by the most recent cumulative |
| // acknowledgement. |
| Update(packetsAcked int) |
| |
| // PostRecovery is invoked when the sender is exiting a fast retransmit/ |
| // recovery phase. This provides congestion control algorithms a way |
| // to adjust their state when exiting recovery. |
| PostRecovery() |
| } |
| |
| // lossRecovery is an interface that must be implemented by any supported |
| // loss recovery algorithm. |
| type lossRecovery interface { |
| // DoRecovery is invoked when loss is detected and segments need |
| // to be retransmitted. The cumulative or selective ACK is passed along |
| // with the flag which identifies whether the connection entered fast |
| // retransmit with this ACK and to retransmit the first unacknowledged |
| // segment. |
| DoRecovery(rcvdSeg *segment, fastRetransmit bool) |
| } |
| |
| // sender holds the state necessary to send TCP segments. |
| // |
| // +stateify savable |
| type sender struct { |
| ep *endpoint |
| |
| // lastSendTime is the timestamp when the last packet was sent. |
| lastSendTime time.Time `state:".(unixTime)"` |
| |
| // dupAckCount is the number of duplicated acks received. It is used for |
| // fast retransmit. |
| dupAckCount int |
| |
| // fr holds state related to fast recovery. |
| fr fastRecovery |
| |
| // lr is the loss recovery algorithm used by the sender. |
| lr lossRecovery |
| |
| // sndCwnd is the congestion window, in packets. |
| sndCwnd int |
| |
| // sndSsthresh is the threshold between slow start and congestion |
| // avoidance. |
| sndSsthresh int |
| |
| // sndCAAckCount is the number of packets acknowledged during congestion |
| // avoidance. When enough packets have been ack'd (typically cwnd |
| // packets), the congestion window is incremented by one. |
| sndCAAckCount int |
| |
| // outstanding is the number of outstanding packets, that is, packets |
| // that have been sent but not yet acknowledged. |
| outstanding int |
| |
| // sackedOut is the number of packets which are selectively acked. |
| sackedOut int |
| |
| // sndWnd is the send window size. |
| sndWnd seqnum.Size |
| |
| // sndUna is the next unacknowledged sequence number. |
| sndUna seqnum.Value |
| |
| // sndNxt is the sequence number of the next segment to be sent. |
| sndNxt seqnum.Value |
| |
| // rttMeasureSeqNum is the sequence number being used for the latest RTT |
| // measurement. |
| rttMeasureSeqNum seqnum.Value |
| |
| // rttMeasureTime is the time when the rttMeasureSeqNum was sent. |
| rttMeasureTime time.Time `state:".(unixTime)"` |
| |
| // firstRetransmittedSegXmitTime is the original transmit time of |
| // the first segment that was retransmitted due to RTO expiration. |
| firstRetransmittedSegXmitTime time.Time `state:".(unixTime)"` |
| |
| // zeroWindowProbing is set if the sender is currently probing |
| // for zero receive window. |
| zeroWindowProbing bool `state:"nosave"` |
| |
| // unackZeroWindowProbes is the number of unacknowledged zero |
| // window probes. |
| unackZeroWindowProbes uint32 `state:"nosave"` |
| |
| closed bool |
| writeNext *segment |
| writeList segmentList |
| resendTimer timer `state:"nosave"` |
| resendWaker sleep.Waker `state:"nosave"` |
| |
| // rtt.srtt, rtt.rttvar, and rto are the "smoothed round-trip time", |
| // "round-trip time variation" and "retransmit timeout", as defined in |
| // section 2 of RFC 6298. |
| rtt rtt |
| rto time.Duration |
| |
| // minRTO is the minimum permitted value for sender.rto. |
| minRTO time.Duration |
| |
| // maxRTO is the maximum permitted value for sender.rto. |
| maxRTO time.Duration |
| |
| // maxRetries is the maximum permitted retransmissions. |
| maxRetries uint32 |
| |
| // maxPayloadSize is the maximum size of the payload of a given segment. |
| // It is initialized on demand. |
| maxPayloadSize int |
| |
| // gso is set if generic segmentation offload is enabled. |
| gso bool |
| |
| // sndWndScale is the number of bits to shift left when reading the send |
| // window size from a segment. |
| sndWndScale uint8 |
| |
| // maxSentAck is the maxium acknowledgement actually sent. |
| maxSentAck seqnum.Value |
| |
| // state is the current state of congestion control for this endpoint. |
| state tcpip.CongestionControlState |
| |
| // cc is the congestion control algorithm in use for this sender. |
| cc congestionControl |
| |
| // rc has the fields needed for implementing RACK loss detection |
| // algorithm. |
| rc rackControl |
| |
| // reorderTimer is the timer used to retransmit the segments after RACK |
| // detects them as lost. |
| reorderTimer timer `state:"nosave"` |
| reorderWaker sleep.Waker `state:"nosave"` |
| |
| // probeTimer and probeWaker are used to schedule PTO for RACK TLP algorithm. |
| probeTimer timer `state:"nosave"` |
| probeWaker sleep.Waker `state:"nosave"` |
| } |
| |
| // rtt is a synchronization wrapper used to appease stateify. See the comment |
| // in sender, where it is used. |
| // |
| // +stateify savable |
| type rtt struct { |
| sync.Mutex `state:"nosave"` |
| |
| srtt time.Duration |
| rttvar time.Duration |
| srttInited bool |
| } |
| |
| // fastRecovery holds information related to fast recovery from a packet loss. |
| // |
| // +stateify savable |
| type fastRecovery struct { |
| // active whether the endpoint is in fast recovery. The following fields |
| // are only meaningful when active is true. |
| active bool |
| |
| // first and last represent the inclusive sequence number range being |
| // recovered. |
| first seqnum.Value |
| last seqnum.Value |
| |
| // maxCwnd is the maximum value the congestion window may be inflated to |
| // due to duplicate acks. This exists to avoid attacks where the |
| // receiver intentionally sends duplicate acks to artificially inflate |
| // the sender's cwnd. |
| maxCwnd int |
| |
| // highRxt is the highest sequence number which has been retransmitted |
| // during the current loss recovery phase. |
| // See: RFC 6675 Section 2 for details. |
| highRxt seqnum.Value |
| |
| // rescueRxt is the highest sequence number which has been |
| // optimistically retransmitted to prevent stalling of the ACK clock |
| // when there is loss at the end of the window and no new data is |
| // available for transmission. |
| // See: RFC 6675 Section 2 for details. |
| rescueRxt seqnum.Value |
| } |
| |
| func newSender(ep *endpoint, iss, irs seqnum.Value, sndWnd seqnum.Size, mss uint16, sndWndScale int) *sender { |
| // The sender MUST reduce the TCP data length to account for any IP or |
| // TCP options that it is including in the packets that it sends. |
| // See: https://tools.ietf.org/html/rfc6691#section-2 |
| maxPayloadSize := int(mss) - ep.maxOptionSize() |
| |
| s := &sender{ |
| ep: ep, |
| sndWnd: sndWnd, |
| sndUna: iss + 1, |
| sndNxt: iss + 1, |
| rto: 1 * time.Second, |
| rttMeasureSeqNum: iss + 1, |
| lastSendTime: time.Now(), |
| maxPayloadSize: maxPayloadSize, |
| maxSentAck: irs + 1, |
| fr: fastRecovery{ |
| // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 1. |
| last: iss, |
| highRxt: iss, |
| rescueRxt: iss, |
| }, |
| gso: ep.gso != nil, |
| } |
| |
| if s.gso { |
| s.ep.gso.MSS = uint16(maxPayloadSize) |
| } |
| |
| s.cc = s.initCongestionControl(ep.cc) |
| s.lr = s.initLossRecovery() |
| s.rc.init(s, iss) |
| |
| // A negative sndWndScale means that no scaling is in use, otherwise we |
| // store the scaling value. |
| if sndWndScale > 0 { |
| s.sndWndScale = uint8(sndWndScale) |
| } |
| |
| s.resendTimer.init(&s.resendWaker) |
| s.reorderTimer.init(&s.reorderWaker) |
| s.probeTimer.init(&s.probeWaker) |
| |
| s.updateMaxPayloadSize(int(ep.route.MTU()), 0) |
| |
| // Initialize SACK Scoreboard after updating max payload size as we use |
| // the maxPayloadSize as the smss when determining if a segment is lost |
| // etc. |
| s.ep.scoreboard = NewSACKScoreboard(uint16(s.maxPayloadSize), iss) |
| |
| // Get Stack wide config. |
| var minRTO tcpip.TCPMinRTOOption |
| if err := ep.stack.TransportProtocolOption(ProtocolNumber, &minRTO); err != nil { |
| panic(fmt.Sprintf("unable to get minRTO from stack: %s", err)) |
| } |
| s.minRTO = time.Duration(minRTO) |
| |
| var maxRTO tcpip.TCPMaxRTOOption |
| if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRTO); err != nil { |
| panic(fmt.Sprintf("unable to get maxRTO from stack: %s", err)) |
| } |
| s.maxRTO = time.Duration(maxRTO) |
| |
| var maxRetries tcpip.TCPMaxRetriesOption |
| if err := ep.stack.TransportProtocolOption(ProtocolNumber, &maxRetries); err != nil { |
| panic(fmt.Sprintf("unable to get maxRetries from stack: %s", err)) |
| } |
| s.maxRetries = uint32(maxRetries) |
| |
| return s |
| } |
| |
| // initCongestionControl initializes the specified congestion control module and |
| // returns a handle to it. It also initializes the sndCwnd and sndSsThresh to |
| // their initial values. |
| func (s *sender) initCongestionControl(congestionControlName tcpip.CongestionControlOption) congestionControl { |
| s.sndCwnd = InitialCwnd |
| // Set sndSsthresh to the maximum int value, which depends on the |
| // platform. |
| s.sndSsthresh = int(^uint(0) >> 1) |
| |
| switch congestionControlName { |
| case ccCubic: |
| return newCubicCC(s) |
| case ccReno: |
| fallthrough |
| default: |
| return newRenoCC(s) |
| } |
| } |
| |
| // initLossRecovery initiates the loss recovery algorithm for the sender. |
| func (s *sender) initLossRecovery() lossRecovery { |
| if s.ep.sackPermitted { |
| return newSACKRecovery(s) |
| } |
| return newRenoRecovery(s) |
| } |
| |
| // updateMaxPayloadSize updates the maximum payload size based on the given |
| // MTU. If this is in response to "packet too big" control packets (indicated |
| // by the count argument), it also reduces the number of outstanding packets and |
| // attempts to retransmit the first packet above the MTU size. |
| func (s *sender) updateMaxPayloadSize(mtu, count int) { |
| m := mtu - header.TCPMinimumSize |
| |
| m -= s.ep.maxOptionSize() |
| |
| // We don't adjust up for now. |
| if m >= s.maxPayloadSize { |
| return |
| } |
| |
| // Make sure we can transmit at least one byte. |
| if m <= 0 { |
| m = 1 |
| } |
| |
| oldMSS := s.maxPayloadSize |
| s.maxPayloadSize = m |
| if s.gso { |
| s.ep.gso.MSS = uint16(m) |
| } |
| |
| if count == 0 { |
| // updateMaxPayloadSize is also called when the sender is created. |
| // and there is no data to send in such cases. Return immediately. |
| return |
| } |
| |
| // Update the scoreboard's smss to reflect the new lowered |
| // maxPayloadSize. |
| s.ep.scoreboard.smss = uint16(m) |
| |
| s.outstanding -= count |
| if s.outstanding < 0 { |
| s.outstanding = 0 |
| } |
| |
| // Rewind writeNext to the first segment exceeding the MTU. Do nothing |
| // if it is already before such a packet. |
| nextSeg := s.writeNext |
| for seg := s.writeList.Front(); seg != nil; seg = seg.Next() { |
| if seg == s.writeNext { |
| // We got to writeNext before we could find a segment |
| // exceeding the MTU. |
| break |
| } |
| |
| if nextSeg == s.writeNext && seg.data.Size() > m { |
| // We found a segment exceeding the MTU. Rewind |
| // writeNext and try to retransmit it. |
| nextSeg = seg |
| } |
| |
| if s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { |
| // Update sackedOut for new maximum payload size. |
| s.sackedOut -= s.pCount(seg, oldMSS) |
| s.sackedOut += s.pCount(seg, s.maxPayloadSize) |
| } |
| } |
| |
| // Since we likely reduced the number of outstanding packets, we may be |
| // ready to send some more. |
| s.writeNext = nextSeg |
| s.sendData() |
| } |
| |
| // sendAck sends an ACK segment. |
| func (s *sender) sendAck() { |
| s.sendSegmentFromView(buffer.VectorisedView{}, header.TCPFlagAck, s.sndNxt) |
| } |
| |
| // updateRTO updates the retransmit timeout when a new roud-trip time is |
| // available. This is done in accordance with section 2 of RFC 6298. |
| func (s *sender) updateRTO(rtt time.Duration) { |
| s.rtt.Lock() |
| if !s.rtt.srttInited { |
| s.rtt.rttvar = rtt / 2 |
| s.rtt.srtt = rtt |
| s.rtt.srttInited = true |
| } else { |
| diff := s.rtt.srtt - rtt |
| if diff < 0 { |
| diff = -diff |
| } |
| // Use RFC6298 standard algorithm to update rttvar and srtt when |
| // no timestamps are available. |
| if !s.ep.sendTSOk { |
| s.rtt.rttvar = (3*s.rtt.rttvar + diff) / 4 |
| s.rtt.srtt = (7*s.rtt.srtt + rtt) / 8 |
| } else { |
| // When we are taking RTT measurements of every ACK then |
| // we need to use a modified method as specified in |
| // https://tools.ietf.org/html/rfc7323#appendix-G |
| if s.outstanding == 0 { |
| s.rtt.Unlock() |
| return |
| } |
| // Netstack measures congestion window/inflight all in |
| // terms of packets and not bytes. This is similar to |
| // how linux also does cwnd and inflight. In practice |
| // this approximation works as expected. |
| expectedSamples := math.Ceil(float64(s.outstanding) / 2) |
| |
| // alpha & beta values are the original values as recommended in |
| // https://tools.ietf.org/html/rfc6298#section-2.3. |
| const alpha = 0.125 |
| const beta = 0.25 |
| |
| alphaPrime := alpha / expectedSamples |
| betaPrime := beta / expectedSamples |
| rttVar := (1-betaPrime)*s.rtt.rttvar.Seconds() + betaPrime*diff.Seconds() |
| srtt := (1-alphaPrime)*s.rtt.srtt.Seconds() + alphaPrime*rtt.Seconds() |
| s.rtt.rttvar = time.Duration(rttVar * float64(time.Second)) |
| s.rtt.srtt = time.Duration(srtt * float64(time.Second)) |
| } |
| } |
| |
| s.rto = s.rtt.srtt + 4*s.rtt.rttvar |
| s.rtt.Unlock() |
| if s.rto < s.minRTO { |
| s.rto = s.minRTO |
| } |
| } |
| |
| // resendSegment resends the first unacknowledged segment. |
| func (s *sender) resendSegment() { |
| // Don't use any segments we already sent to measure RTT as they may |
| // have been affected by packets being lost. |
| s.rttMeasureSeqNum = s.sndNxt |
| |
| // Resend the segment. |
| if seg := s.writeList.Front(); seg != nil { |
| if seg.data.Size() > s.maxPayloadSize { |
| s.splitSeg(seg, s.maxPayloadSize) |
| } |
| |
| // See: RFC 6675 section 5 Step 4.3 |
| // |
| // To prevent retransmission, set both the HighRXT and RescueRXT |
| // to the highest sequence number in the retransmitted segment. |
| s.fr.highRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 |
| s.fr.rescueRxt = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) - 1 |
| s.sendSegment(seg) |
| s.ep.stack.Stats().TCP.FastRetransmit.Increment() |
| s.ep.stats.SendErrors.FastRetransmit.Increment() |
| |
| // Run SetPipe() as per RFC 6675 section 5 Step 4.4 |
| s.SetPipe() |
| } |
| } |
| |
| // retransmitTimerExpired is called when the retransmit timer expires, and |
| // unacknowledged segments are assumed lost, and thus need to be resent. |
| // Returns true if the connection is still usable, or false if the connection |
| // is deemed lost. |
| func (s *sender) retransmitTimerExpired() bool { |
| // Check if the timer actually expired or if it's a spurious wake due |
| // to a previously orphaned runtime timer. |
| if !s.resendTimer.checkExpiration() { |
| return true |
| } |
| |
| // TODO(b/147297758): Band-aid fix, retransmitTimer can fire in some edge cases |
| // when writeList is empty. Remove this once we have a proper fix for this |
| // issue. |
| if s.writeList.Front() == nil { |
| return true |
| } |
| |
| s.ep.stack.Stats().TCP.Timeouts.Increment() |
| s.ep.stats.SendErrors.Timeouts.Increment() |
| |
| // Set TLPRxtOut to false according to |
| // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. |
| s.rc.tlpRxtOut = false |
| |
| // Give up if we've waited more than a minute since the last resend or |
| // if a user time out is set and we have exceeded the user specified |
| // timeout since the first retransmission. |
| uto := s.ep.userTimeout |
| |
| if s.firstRetransmittedSegXmitTime.IsZero() { |
| // We store the original xmitTime of the segment that we are |
| // about to retransmit as the retransmission time. This is |
| // required as by the time the retransmitTimer has expired the |
| // segment has already been sent and unacked for the RTO at the |
| // time the segment was sent. |
| s.firstRetransmittedSegXmitTime = s.writeList.Front().xmitTime |
| } |
| |
| elapsed := time.Since(s.firstRetransmittedSegXmitTime) |
| remaining := s.maxRTO |
| if uto != 0 { |
| // Cap to the user specified timeout if one is specified. |
| remaining = uto - elapsed |
| } |
| |
| // Always honor the user-timeout irrespective of whether the zero |
| // window probes were acknowledged. |
| // net/ipv4/tcp_timer.c::tcp_probe_timer() |
| if remaining <= 0 || s.unackZeroWindowProbes >= s.maxRetries { |
| return false |
| } |
| |
| // Set new timeout. The timer will be restarted by the call to sendData |
| // below. |
| s.rto *= 2 |
| // Cap the RTO as per RFC 1122 4.2.3.1, RFC 6298 5.5 |
| if s.rto > s.maxRTO { |
| s.rto = s.maxRTO |
| } |
| |
| // Cap RTO to remaining time. |
| if s.rto > remaining { |
| s.rto = remaining |
| } |
| |
| // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 4. |
| // |
| // Retransmit timeouts: |
| // After a retransmit timeout, record the highest sequence number |
| // transmitted in the variable recover, and exit the fast recovery |
| // procedure if applicable. |
| s.fr.last = s.sndNxt - 1 |
| |
| if s.fr.active { |
| // We were attempting fast recovery but were not successful. |
| // Leave the state. We don't need to update ssthresh because it |
| // has already been updated when entered fast-recovery. |
| s.leaveRecovery() |
| } |
| |
| s.state = tcpip.RTORecovery |
| s.cc.HandleRTOExpired() |
| |
| // Mark the next segment to be sent as the first unacknowledged one and |
| // start sending again. Set the number of outstanding packets to 0 so |
| // that we'll be able to retransmit. |
| // |
| // We'll keep on transmitting (or retransmitting) as we get acks for |
| // the data we transmit. |
| s.outstanding = 0 |
| |
| // Expunge all SACK information as per https://tools.ietf.org/html/rfc6675#section-5.1 |
| // |
| // In order to avoid memory deadlocks, the TCP receiver is allowed to |
| // discard data that has already been selectively acknowledged. As a |
| // result, [RFC2018] suggests that a TCP sender SHOULD expunge the SACK |
| // information gathered from a receiver upon a retransmission timeout |
| // (RTO) "since the timeout might indicate that the data receiver has |
| // reneged." Additionally, a TCP sender MUST "ignore prior SACK |
| // information in determining which data to retransmit." |
| // |
| // NOTE: We take the stricter interpretation and just expunge all |
| // information as we lack more rigorous checks to validate if the SACK |
| // information is usable after an RTO. |
| s.ep.scoreboard.Reset() |
| s.writeNext = s.writeList.Front() |
| |
| // RFC 1122 4.2.2.17: Start sending zero window probes when we still see a |
| // zero receive window after retransmission interval and we have data to |
| // send. |
| if s.zeroWindowProbing { |
| s.sendZeroWindowProbe() |
| // RFC 1122 4.2.2.17: A TCP MAY keep its offered receive window closed |
| // indefinitely. As long as the receiving TCP continues to send |
| // acknowledgments in response to the probe segments, the sending TCP |
| // MUST allow the connection to stay open. |
| return true |
| } |
| |
| seg := s.writeNext |
| // RFC 1122 4.2.3.5: Close the connection when the number of |
| // retransmissions for this segment is beyond a limit. |
| if seg != nil && seg.xmitCount > s.maxRetries { |
| return false |
| } |
| |
| s.sendData() |
| |
| return true |
| } |
| |
| // pCount returns the number of packets in the segment. Due to GSO, a segment |
| // can be composed of multiple packets. |
| func (s *sender) pCount(seg *segment, maxPayloadSize int) int { |
| size := seg.data.Size() |
| if size == 0 { |
| return 1 |
| } |
| |
| return (size-1)/maxPayloadSize + 1 |
| } |
| |
| // splitSeg splits a given segment at the size specified and inserts the |
| // remainder as a new segment after the current one in the write list. |
| func (s *sender) splitSeg(seg *segment, size int) { |
| if seg.data.Size() <= size { |
| return |
| } |
| // Split this segment up. |
| nSeg := seg.clone() |
| nSeg.data.TrimFront(size) |
| nSeg.sequenceNumber.UpdateForward(seqnum.Size(size)) |
| s.writeList.InsertAfter(seg, nSeg) |
| |
| // The segment being split does not carry PUSH flag because it is |
| // followed by the newly split segment. |
| // RFC1122 section 4.2.2.2: MUST set the PSH bit in the last buffered |
| // segment (i.e., when there is no more queued data to be sent). |
| // Linux removes PSH flag only when the segment is being split over MSS |
| // and retains it when we are splitting the segment over lack of sender |
| // window space. |
| // ref: net/ipv4/tcp_output.c::tcp_write_xmit(), tcp_mss_split_point() |
| // ref: net/ipv4/tcp_output.c::tcp_write_wakeup(), tcp_snd_wnd_test() |
| if seg.data.Size() > s.maxPayloadSize { |
| seg.flags ^= header.TCPFlagPsh |
| } |
| |
| seg.data.CapLength(size) |
| } |
| |
| // NextSeg implements the RFC6675 NextSeg() operation. |
| // |
| // NextSeg starts scanning the writeList starting from nextSegHint and returns |
| // the hint to be passed on the next call to NextSeg. This is required to avoid |
| // iterating the write list repeatedly when NextSeg is invoked in a loop during |
| // recovery. The returned hint will be nil if there are no more segments that |
| // can match rules defined by NextSeg operation in RFC6675. |
| // |
| // rescueRtx will be true only if nextSeg is a rescue retransmission as |
| // described by Step 4) of the NextSeg algorithm. |
| func (s *sender) NextSeg(nextSegHint *segment) (nextSeg, hint *segment, rescueRtx bool) { |
| var s3 *segment |
| var s4 *segment |
| // Step 1. |
| for seg := nextSegHint; seg != nil; seg = seg.Next() { |
| // Stop iteration if we hit a segment that has never been |
| // transmitted (i.e. either it has no assigned sequence number |
| // or if it does have one, it's >= the next sequence number |
| // to be sent [i.e. >= s.sndNxt]). |
| if !s.isAssignedSequenceNumber(seg) || s.sndNxt.LessThanEq(seg.sequenceNumber) { |
| hint = nil |
| break |
| } |
| segSeq := seg.sequenceNumber |
| if smss := s.ep.scoreboard.SMSS(); seg.data.Size() > int(smss) { |
| s.splitSeg(seg, int(smss)) |
| } |
| |
| // See RFC 6675 Section 4 |
| // |
| // 1. If there exists a smallest unSACKED sequence number |
| // 'S2' that meets the following 3 criteria for determinig |
| // loss, the sequence range of one segment of up to SMSS |
| // octects starting with S2 MUST be returned. |
| if !s.ep.scoreboard.IsSACKED(header.SACKBlock{segSeq, segSeq.Add(1)}) { |
| // NextSeg(): |
| // |
| // (1.a) S2 is greater than HighRxt |
| // (1.b) S2 is less than highest octect covered by |
| // any received SACK. |
| if s.fr.highRxt.LessThan(segSeq) && segSeq.LessThan(s.ep.scoreboard.maxSACKED) { |
| // NextSeg(): |
| // (1.c) IsLost(S2) returns true. |
| if s.ep.scoreboard.IsLost(segSeq) { |
| return seg, seg.Next(), false |
| } |
| |
| // NextSeg(): |
| // |
| // (3): If the conditions for rules (1) and (2) |
| // fail, but there exists an unSACKed sequence |
| // number S3 that meets the criteria for |
| // detecting loss given in steps 1.a and 1.b |
| // above (specifically excluding (1.c)) then one |
| // segment of upto SMSS octets starting with S3 |
| // SHOULD be returned. |
| if s3 == nil { |
| s3 = seg |
| hint = seg.Next() |
| } |
| } |
| // NextSeg(): |
| // |
| // (4) If the conditions for (1), (2) and (3) fail, |
| // but there exists outstanding unSACKED data, we |
| // provide the opportunity for a single "rescue" |
| // retransmission per entry into loss recovery. If |
| // HighACK is greater than RescueRxt (or RescueRxt |
| // is undefined), then one segment of upto SMSS |
| // octects that MUST include the highest outstanding |
| // unSACKed sequence number SHOULD be returned, and |
| // RescueRxt set to RecoveryPoint. HighRxt MUST NOT |
| // be updated. |
| if s.fr.rescueRxt.LessThan(s.sndUna - 1) { |
| if s4 != nil { |
| if s4.sequenceNumber.LessThan(segSeq) { |
| s4 = seg |
| } |
| } else { |
| s4 = seg |
| } |
| } |
| } |
| } |
| |
| // If we got here then no segment matched step (1). |
| // Step (2): "If no sequence number 'S2' per rule (1) |
| // exists but there exists available unsent data and the |
| // receiver's advertised window allows, the sequence |
| // range of one segment of up to SMSS octets of |
| // previously unsent data starting with sequence number |
| // HighData+1 MUST be returned." |
| for seg := s.writeNext; seg != nil; seg = seg.Next() { |
| if s.isAssignedSequenceNumber(seg) && seg.sequenceNumber.LessThan(s.sndNxt) { |
| continue |
| } |
| // We do not split the segment here to <= smss as it has |
| // potentially not been assigned a sequence number yet. |
| return seg, nil, false |
| } |
| |
| if s3 != nil { |
| return s3, hint, false |
| } |
| |
| return s4, nil, true |
| } |
| |
| // maybeSendSegment tries to send the specified segment and either coalesces |
| // other segments into this one or splits the specified segment based on the |
| // lower of the specified limit value or the receivers window size specified by |
| // end. |
| func (s *sender) maybeSendSegment(seg *segment, limit int, end seqnum.Value) (sent bool) { |
| // We abuse the flags field to determine if we have already |
| // assigned a sequence number to this segment. |
| if !s.isAssignedSequenceNumber(seg) { |
| // Merge segments if allowed. |
| if seg.data.Size() != 0 { |
| available := int(s.sndNxt.Size(end)) |
| if available > limit { |
| available = limit |
| } |
| |
| // nextTooBig indicates that the next segment was too |
| // large to entirely fit in the current segment. It |
| // would be possible to split the next segment and merge |
| // the portion that fits, but unexpectedly splitting |
| // segments can have user visible side-effects which can |
| // break applications. For example, RFC 7766 section 8 |
| // says that the length and data of a DNS response |
| // should be sent in the same TCP segment to avoid |
| // triggering bugs in poorly written DNS |
| // implementations. |
| var nextTooBig bool |
| for seg.Next() != nil && seg.Next().data.Size() != 0 { |
| if seg.data.Size()+seg.Next().data.Size() > available { |
| nextTooBig = true |
| break |
| } |
| seg.data.Append(seg.Next().data) |
| |
| // Consume the segment that we just merged in. |
| s.writeList.Remove(seg.Next()) |
| } |
| if !nextTooBig && seg.data.Size() < available { |
| // Segment is not full. |
| if s.outstanding > 0 && s.ep.ops.GetDelayOption() { |
| // Nagle's algorithm. From Wikipedia: |
| // Nagle's algorithm works by |
| // combining a number of small |
| // outgoing messages and sending them |
| // all at once. Specifically, as long |
| // as there is a sent packet for which |
| // the sender has received no |
| // acknowledgment, the sender should |
| // keep buffering its output until it |
| // has a full packet's worth of |
| // output, thus allowing output to be |
| // sent all at once. |
| return false |
| } |
| // With TCP_CORK, hold back until minimum of the available |
| // send space and MSS. |
| // TODO(gvisor.dev/issue/2833): Drain the held segments after a |
| // timeout. |
| if seg.data.Size() < s.maxPayloadSize && s.ep.ops.GetCorkOption() { |
| return false |
| } |
| } |
| } |
| |
| // Assign flags. We don't do it above so that we can merge |
| // additional data if Nagle holds the segment. |
| seg.sequenceNumber = s.sndNxt |
| seg.flags = header.TCPFlagAck | header.TCPFlagPsh |
| } |
| |
| var segEnd seqnum.Value |
| if seg.data.Size() == 0 { |
| if s.writeList.Back() != seg { |
| panic("FIN segments must be the final segment in the write list.") |
| } |
| seg.flags = header.TCPFlagAck | header.TCPFlagFin |
| segEnd = seg.sequenceNumber.Add(1) |
| // Update the state to reflect that we have now |
| // queued a FIN. |
| switch s.ep.EndpointState() { |
| case StateCloseWait: |
| s.ep.setEndpointState(StateLastAck) |
| default: |
| s.ep.setEndpointState(StateFinWait1) |
| } |
| } else { |
| // We're sending a non-FIN segment. |
| if seg.flags&header.TCPFlagFin != 0 { |
| panic("Netstack queues FIN segments without data.") |
| } |
| |
| if !seg.sequenceNumber.LessThan(end) { |
| return false |
| } |
| |
| available := int(seg.sequenceNumber.Size(end)) |
| if available == 0 { |
| return false |
| } |
| |
| // If the whole segment or at least 1MSS sized segment cannot |
| // be accomodated in the receiver advertized window, skip |
| // splitting and sending of the segment. ref: |
| // net/ipv4/tcp_output.c::tcp_snd_wnd_test() |
| // |
| // Linux checks this for all segment transmits not triggered by |
| // a probe timer. On this condition, it defers the segment split |
| // and transmit to a short probe timer. |
| // |
| // ref: include/net/tcp.h::tcp_check_probe_timer() |
| // ref: net/ipv4/tcp_output.c::tcp_write_wakeup() |
| // |
| // Instead of defining a new transmit timer, we attempt to split |
| // the segment right here if there are no pending segments. If |
| // there are pending segments, segment transmits are deferred to |
| // the retransmit timer handler. |
| if s.sndUna != s.sndNxt { |
| switch { |
| case available >= seg.data.Size(): |
| // OK to send, the whole segments fits in the |
| // receiver's advertised window. |
| case available >= s.maxPayloadSize: |
| // OK to send, at least 1 MSS sized segment fits |
| // in the receiver's advertised window. |
| default: |
| return false |
| } |
| } |
| |
| // The segment size limit is computed as a function of sender |
| // congestion window and MSS. When sender congestion window is > |
| // 1, this limit can be larger than MSS. Ensure that the |
| // currently available send space is not greater than minimum of |
| // this limit and MSS. |
| if available > limit { |
| available = limit |
| } |
| |
| // If GSO is not in use then cap available to |
| // maxPayloadSize. When GSO is in use the gVisor GSO logic or |
| // the host GSO logic will cap the segment to the correct size. |
| if s.ep.gso == nil && available > s.maxPayloadSize { |
| available = s.maxPayloadSize |
| } |
| |
| if seg.data.Size() > available { |
| s.splitSeg(seg, available) |
| } |
| |
| segEnd = seg.sequenceNumber.Add(seqnum.Size(seg.data.Size())) |
| } |
| |
| s.sendSegment(seg) |
| |
| // Update sndNxt if we actually sent new data (as opposed to |
| // retransmitting some previously sent data). |
| if s.sndNxt.LessThan(segEnd) { |
| s.sndNxt = segEnd |
| } |
| |
| return true |
| } |
| |
| func (s *sender) sendZeroWindowProbe() { |
| ack, win := s.ep.rcv.getSendParams() |
| s.unackZeroWindowProbes++ |
| // Send a zero window probe with sequence number pointing to |
| // the last acknowledged byte. |
| s.ep.sendRaw(buffer.VectorisedView{}, header.TCPFlagAck, s.sndUna-1, ack, win) |
| // Rearm the timer to continue probing. |
| s.resendTimer.enable(s.rto) |
| } |
| |
| func (s *sender) enableZeroWindowProbing() { |
| s.zeroWindowProbing = true |
| // We piggyback the probing on the retransmit timer with the |
| // current retranmission interval, as we may start probing while |
| // segment retransmissions. |
| if s.firstRetransmittedSegXmitTime.IsZero() { |
| s.firstRetransmittedSegXmitTime = time.Now() |
| } |
| s.resendTimer.enable(s.rto) |
| } |
| |
| func (s *sender) disableZeroWindowProbing() { |
| s.zeroWindowProbing = false |
| s.unackZeroWindowProbes = 0 |
| s.firstRetransmittedSegXmitTime = time.Time{} |
| s.resendTimer.disable() |
| } |
| |
| func (s *sender) postXmit(dataSent bool, shouldScheduleProbe bool) { |
| if dataSent { |
| // We sent data, so we should stop the keepalive timer to ensure |
| // that no keepalives are sent while there is pending data. |
| s.ep.disableKeepaliveTimer() |
| } |
| |
| // If the sender has advertized zero receive window and we have |
| // data to be sent out, start zero window probing to query the |
| // the remote for it's receive window size. |
| if s.writeNext != nil && s.sndWnd == 0 { |
| s.enableZeroWindowProbing() |
| } |
| |
| // If we have no more pending data, start the keepalive timer. |
| if s.sndUna == s.sndNxt { |
| s.ep.resetKeepaliveTimer(false) |
| } else { |
| // Enable timers if we have pending data. |
| if shouldScheduleProbe && s.shouldSchedulePTO() { |
| // Schedule PTO after transmitting new data that wasn't itself a TLP probe. |
| s.schedulePTO() |
| } else if !s.resendTimer.enabled() { |
| s.probeTimer.disable() |
| if s.outstanding > 0 { |
| // Enable the resend timer if it's not enabled yet and there is |
| // outstanding data. |
| s.resendTimer.enable(s.rto) |
| } |
| } |
| } |
| } |
| |
| // sendData sends new data segments. It is called when data becomes available or |
| // when the send window opens up. |
| func (s *sender) sendData() { |
| limit := s.maxPayloadSize |
| if s.gso { |
| limit = int(s.ep.gso.MaxSize - header.TCPHeaderMaximumSize) |
| } |
| end := s.sndUna.Add(s.sndWnd) |
| |
| // Reduce the congestion window to min(IW, cwnd) per RFC 5681, page 10. |
| // "A TCP SHOULD set cwnd to no more than RW before beginning |
| // transmission if the TCP has not sent data in the interval exceeding |
| // the retrasmission timeout." |
| if !s.fr.active && s.state != tcpip.RTORecovery && time.Now().Sub(s.lastSendTime) > s.rto { |
| if s.sndCwnd > InitialCwnd { |
| s.sndCwnd = InitialCwnd |
| } |
| } |
| |
| var dataSent bool |
| for seg := s.writeNext; seg != nil && s.outstanding < s.sndCwnd; seg = seg.Next() { |
| cwndLimit := (s.sndCwnd - s.outstanding) * s.maxPayloadSize |
| if cwndLimit < limit { |
| limit = cwndLimit |
| } |
| if s.isAssignedSequenceNumber(seg) && s.ep.sackPermitted && s.ep.scoreboard.IsSACKED(seg.sackBlock()) { |
| // Move writeNext along so that we don't try and scan data that |
| // has already been SACKED. |
| s.writeNext = seg.Next() |
| continue |
| } |
| if sent := s.maybeSendSegment(seg, limit, end); !sent { |
| break |
| } |
| dataSent = true |
| s.outstanding += s.pCount(seg, s.maxPayloadSize) |
| s.writeNext = seg.Next() |
| } |
| |
| s.postXmit(dataSent, true /* shouldScheduleProbe */) |
| } |
| |
| func (s *sender) enterRecovery() { |
| s.fr.active = true |
| // Save state to reflect we're now in fast recovery. |
| // |
| // See : https://tools.ietf.org/html/rfc5681#section-3.2 Step 3. |
| // We inflate the cwnd by 3 to account for the 3 packets which triggered |
| // the 3 duplicate ACKs and are now not in flight. |
| s.sndCwnd = s.sndSsthresh + 3 |
| s.sackedOut = 0 |
| s.dupAckCount = 0 |
| s.fr.first = s.sndUna |
| s.fr.last = s.sndNxt - 1 |
| s.fr.maxCwnd = s.sndCwnd + s.outstanding |
| s.fr.highRxt = s.sndUna |
| s.fr.rescueRxt = s.sndUna |
| if s.ep.sackPermitted { |
| s.state = tcpip.SACKRecovery |
| s.ep.stack.Stats().TCP.SACKRecovery.Increment() |
| // Set TLPRxtOut to false according to |
| // https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.6.1. |
| if s.rc.tlpRxtOut { |
| // The tail loss probe triggered recovery. |
| s.ep.stack.Stats().TCP.TLPRecovery.Increment() |
| } |
| s.rc.tlpRxtOut = false |
| return |
| } |
| s.state = tcpip.FastRecovery |
| s.ep.stack.Stats().TCP.FastRecovery.Increment() |
| } |
| |
| func (s *sender) leaveRecovery() { |
| s.fr.active = false |
| s.fr.maxCwnd = 0 |
| s.dupAckCount = 0 |
| |
| // Deflate cwnd. It had been artificially inflated when new dups arrived. |
| s.sndCwnd = s.sndSsthresh |
| s.cc.PostRecovery() |
| } |
| |
| // isAssignedSequenceNumber relies on the fact that we only set flags once a |
| // sequencenumber is assigned and that is only done right before we send the |
| // segment. As a result any segment that has a non-zero flag has a valid |
| // sequence number assigned to it. |
| func (s *sender) isAssignedSequenceNumber(seg *segment) bool { |
| return seg.flags != 0 |
| } |
| |
| // SetPipe implements the SetPipe() function described in RFC6675. Netstack |
| // maintains the congestion window in number of packets and not bytes, so |
| // SetPipe() here measures number of outstanding packets rather than actual |
| // outstanding bytes in the network. |
| func (s *sender) SetPipe() { |
| // If SACK isn't permitted or it is permitted but recovery is not active |
| // then ignore pipe calculations. |
| if !s.ep.sackPermitted || !s.fr.active { |
| return |
| } |
| pipe := 0 |
| smss := seqnum.Size(s.ep.scoreboard.SMSS()) |
| for s1 := s.writeList.Front(); s1 != nil && s1.data.Size() != 0 && s.isAssignedSequenceNumber(s1); s1 = s1.Next() { |
| // With GSO each segment can be much larger than SMSS. So check the segment |
| // in SMSS sized ranges. |
| segEnd := s1.sequenceNumber.Add(seqnum.Size(s1.data.Size())) |
| for startSeq := s1.sequenceNumber; startSeq.LessThan(segEnd); startSeq = startSeq.Add(smss) { |
| endSeq := startSeq.Add(smss) |
| if segEnd.LessThan(endSeq) { |
| endSeq = segEnd |
| } |
| sb := header.SACKBlock{startSeq, endSeq} |
| // SetPipe(): |
| // |
| // After initializing pipe to zero, the following steps are |
| // taken for each octet 'S1' in the sequence space between |
| // HighACK and HighData that has not been SACKed: |
| if !s1.sequenceNumber.LessThan(s.sndNxt) { |
| break |
| } |
| if s.ep.scoreboard.IsSACKED(sb) { |
| continue |
| } |
| |
| // SetPipe(): |
| // |
| // (a) If IsLost(S1) returns false, Pipe is incremened by 1. |
| // |
| // NOTE: here we mark the whole segment as lost. We do not try |
| // and test every byte in our write buffer as we maintain our |
| // pipe in terms of oustanding packets and not bytes. |
| if !s.ep.scoreboard.IsRangeLost(sb) { |
| pipe++ |
| } |
| // SetPipe(): |
| // (b) If S1 <= HighRxt, Pipe is incremented by 1. |
| if s1.sequenceNumber.LessThanEq(s.fr.highRxt) { |
| pipe++ |
| } |
| } |
| } |
| s.outstanding = pipe |
| } |
| |
| // shouldEnterRecovery returns true if the sender should enter fast recovery |
| // based on dupAck count and sack scoreboard. |
| // See RFC 6675 section 5. |
| func (s *sender) shouldEnterRecovery() bool { |
| return s.dupAckCount >= nDupAckThreshold || |
| (s.ep.sackPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 && s.ep.scoreboard.IsLost(s.sndUna)) |
| } |
| |
| // detectLoss is called when an ack is received and returns whether a loss is |
| // detected. It manages the state related to duplicate acks and determines if |
| // a retransmit is needed according to the rules in RFC 6582 (NewReno). |
| func (s *sender) detectLoss(seg *segment) (fastRetransmit bool) { |
| // We're not in fast recovery yet. |
| |
| // If RACK is enabled and there is no reordering we should honor the |
| // three duplicate ACK rule to enter recovery. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-4 |
| if s.ep.sackPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { |
| if s.rc.reorderSeen { |
| return false |
| } |
| } |
| |
| if !s.isDupAck(seg) { |
| s.dupAckCount = 0 |
| return false |
| } |
| |
| s.dupAckCount++ |
| |
| // Do not enter fast recovery until we reach nDupAckThreshold or the |
| // first unacknowledged byte is considered lost as per SACK scoreboard. |
| if !s.shouldEnterRecovery() { |
| // RFC 6675 Step 3. |
| s.fr.highRxt = s.sndUna - 1 |
| // Do run SetPipe() to calculate the outstanding segments. |
| s.SetPipe() |
| s.state = tcpip.Disorder |
| return false |
| } |
| |
| // See: https://tools.ietf.org/html/rfc6582#section-3.2 Step 2 |
| // |
| // We only do the check here, the incrementing of last to the highest |
| // sequence number transmitted till now is done when enterRecovery |
| // is invoked. |
| // |
| // Note that we only enter recovery when at least one more byte of data |
| // beyond s.fr.last (the highest byte that was outstanding when fast |
| // retransmit was last entered) is acked. |
| if !s.fr.last.LessThan(seg.ackNumber - 1) { |
| s.dupAckCount = 0 |
| return false |
| } |
| s.cc.HandleLossDetected() |
| s.enterRecovery() |
| return true |
| } |
| |
| // isDupAck determines if seg is a duplicate ack as defined in |
| // https://tools.ietf.org/html/rfc5681#section-2. |
| func (s *sender) isDupAck(seg *segment) bool { |
| // A TCP that utilizes selective acknowledgments (SACKs) [RFC2018, RFC2883] |
| // can leverage the SACK information to determine when an incoming ACK is a |
| // "duplicate" (e.g., if the ACK contains previously unknown SACK |
| // information). |
| if s.ep.sackPermitted && !seg.hasNewSACKInfo { |
| return false |
| } |
| |
| // (a) The receiver of the ACK has outstanding data. |
| return s.sndUna != s.sndNxt && |
| // (b) The incoming acknowledgment carries no data. |
| seg.logicalLen() == 0 && |
| // (c) The SYN and FIN bits are both off. |
| !seg.flagIsSet(header.TCPFlagFin) && !seg.flagIsSet(header.TCPFlagSyn) && |
| // (d) the ACK number is equal to the greatest acknowledgment received on |
| // the given connection (TCP.UNA from RFC793). |
| seg.ackNumber == s.sndUna && |
| // (e) the advertised window in the incoming acknowledgment equals the |
| // advertised window in the last incoming acknowledgment. |
| s.sndWnd == seg.window |
| } |
| |
| // Iterate the writeList and update RACK for each segment which is newly acked |
| // either cumulatively or selectively. Loop through the segments which are |
| // sacked, and update the RACK related variables and check for reordering. |
| // |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 |
| // steps 2 and 3. |
| func (s *sender) walkSACK(rcvdSeg *segment) { |
| s.rc.setDSACKSeen(false) |
| |
| // Look for DSACK block. |
| idx := 0 |
| n := len(rcvdSeg.parsedOptions.SACKBlocks) |
| if checkDSACK(rcvdSeg) { |
| s.rc.setDSACKSeen(true) |
| idx = 1 |
| n-- |
| } |
| |
| if n == 0 { |
| return |
| } |
| |
| // Sort the SACK blocks. The first block is the most recent unacked |
| // block. The following blocks can be in arbitrary order. |
| sackBlocks := make([]header.SACKBlock, n) |
| copy(sackBlocks, rcvdSeg.parsedOptions.SACKBlocks[idx:]) |
| sort.Slice(sackBlocks, func(i, j int) bool { |
| return sackBlocks[j].Start.LessThan(sackBlocks[i].Start) |
| }) |
| |
| seg := s.writeList.Front() |
| for _, sb := range sackBlocks { |
| for seg != nil && seg.sequenceNumber.LessThan(sb.End) && seg.xmitCount != 0 { |
| if sb.Start.LessThanEq(seg.sequenceNumber) && !seg.acked { |
| s.rc.update(seg, rcvdSeg) |
| s.rc.detectReorder(seg) |
| seg.acked = true |
| s.sackedOut += s.pCount(seg, s.maxPayloadSize) |
| } |
| seg = seg.Next() |
| } |
| } |
| } |
| |
| // checkDSACK checks if a DSACK is reported. |
| func checkDSACK(rcvdSeg *segment) bool { |
| n := len(rcvdSeg.parsedOptions.SACKBlocks) |
| if n == 0 { |
| return false |
| } |
| |
| sb := rcvdSeg.parsedOptions.SACKBlocks[0] |
| // Check if SACK block is invalid. |
| if sb.End.LessThan(sb.Start) { |
| return false |
| } |
| |
| // See: https://tools.ietf.org/html/rfc2883#section-5 DSACK is sent in |
| // at most one SACK block. DSACK is detected in the below two cases: |
| // * If the SACK sequence space is less than this cumulative ACK, it is |
| // an indication that the segment identified by the SACK block has |
| // been received more than once by the receiver. |
| // * If the sequence space in the first SACK block is greater than the |
| // cumulative ACK, then the sender next compares the sequence space |
| // in the first SACK block with the sequence space in the second SACK |
| // block, if there is one. This comparison can determine if the first |
| // SACK block is reporting duplicate data that lies above the |
| // cumulative ACK. |
| if sb.Start.LessThan(rcvdSeg.ackNumber) { |
| return true |
| } |
| |
| if n > 1 { |
| sb1 := rcvdSeg.parsedOptions.SACKBlocks[1] |
| if sb1.End.LessThan(sb1.Start) { |
| return false |
| } |
| |
| // If the first SACK block is fully covered by second SACK |
| // block, then the first block is a DSACK block. |
| if sb.End.LessThanEq(sb1.End) && sb1.Start.LessThanEq(sb.Start) { |
| return true |
| } |
| } |
| |
| return false |
| } |
| |
| // handleRcvdSegment is called when a segment is received; it is responsible for |
| // updating the send-related state. |
| func (s *sender) handleRcvdSegment(rcvdSeg *segment) { |
| // Check if we can extract an RTT measurement from this ack. |
| if !rcvdSeg.parsedOptions.TS && s.rttMeasureSeqNum.LessThan(rcvdSeg.ackNumber) { |
| s.updateRTO(time.Now().Sub(s.rttMeasureTime)) |
| s.rttMeasureSeqNum = s.sndNxt |
| } |
| |
| // Update Timestamp if required. See RFC7323, section-4.3. |
| if s.ep.sendTSOk && rcvdSeg.parsedOptions.TS { |
| s.ep.updateRecentTimestamp(rcvdSeg.parsedOptions.TSVal, s.maxSentAck, rcvdSeg.sequenceNumber) |
| } |
| |
| // Insert SACKBlock information into our scoreboard. |
| if s.ep.sackPermitted { |
| for _, sb := range rcvdSeg.parsedOptions.SACKBlocks { |
| // Only insert the SACK block if the following holds |
| // true: |
| // * SACK block acks data after the ack number in the |
| // current segment. |
| // * SACK block represents a sequence |
| // between sndUna and sndNxt (i.e. data that is |
| // currently unacked and in-flight). |
| // * SACK block that has not been SACKed already. |
| // |
| // NOTE: This check specifically excludes DSACK blocks |
| // which have start/end before sndUna and are used to |
| // indicate spurious retransmissions. |
| if rcvdSeg.ackNumber.LessThan(sb.Start) && s.sndUna.LessThan(sb.Start) && sb.End.LessThanEq(s.sndNxt) && !s.ep.scoreboard.IsSACKED(sb) { |
| s.ep.scoreboard.Insert(sb) |
| rcvdSeg.hasNewSACKInfo = true |
| } |
| } |
| |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08 |
| // section-7.2 |
| // * Step 2: Update RACK stats. |
| // If the ACK is not ignored as invalid, update the RACK.rtt |
| // to be the RTT sample calculated using this ACK, and |
| // continue. If this ACK or SACK was for the most recently |
| // sent packet, then record the RACK.xmit_ts timestamp and |
| // RACK.end_seq sequence implied by this ACK. |
| // * Step 3: Detect packet reordering. |
| // If the ACK selectively or cumulatively acknowledges an |
| // unacknowledged and also never retransmitted sequence below |
| // RACK.fack, then the corresponding packet has been |
| // reordered and RACK.reord is set to TRUE. |
| if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { |
| s.walkSACK(rcvdSeg) |
| } |
| s.SetPipe() |
| } |
| |
| ack := rcvdSeg.ackNumber |
| fastRetransmit := false |
| // Do not leave fast recovery, if the ACK is out of range. |
| if s.fr.active { |
| // Leave fast recovery if it acknowledges all the data covered by |
| // this fast recovery session. |
| if (ack-1).InRange(s.sndUna, s.sndNxt) && s.fr.last.LessThan(ack) { |
| s.leaveRecovery() |
| } |
| } else { |
| // Detect loss by counting the duplicates and enter recovery. |
| fastRetransmit = s.detectLoss(rcvdSeg) |
| } |
| |
| // See if TLP based recovery was successful. |
| if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { |
| s.detectTLPRecovery(ack, rcvdSeg) |
| } |
| |
| // Stash away the current window size. |
| s.sndWnd = rcvdSeg.window |
| |
| // Disable zero window probing if remote advertizes a non-zero receive |
| // window. This can be with an ACK to the zero window probe (where the |
| // acknumber refers to the already acknowledged byte) OR to any previously |
| // unacknowledged segment. |
| if s.zeroWindowProbing && rcvdSeg.window > 0 && |
| (ack == s.sndUna || (ack-1).InRange(s.sndUna, s.sndNxt)) { |
| s.disableZeroWindowProbing() |
| } |
| |
| // On receiving the ACK for the zero window probe, account for it and |
| // skip trying to send any segment as we are still probing for |
| // receive window to become non-zero. |
| if s.zeroWindowProbing && s.unackZeroWindowProbes > 0 && ack == s.sndUna { |
| s.unackZeroWindowProbes-- |
| return |
| } |
| |
| // Ignore ack if it doesn't acknowledge any new data. |
| if (ack - 1).InRange(s.sndUna, s.sndNxt) { |
| s.dupAckCount = 0 |
| |
| // See : https://tools.ietf.org/html/rfc1323#section-3.3. |
| // Specifically we should only update the RTO using TSEcr if the |
| // following condition holds: |
| // |
| // A TSecr value received in a segment is used to update the |
| // averaged RTT measurement only if the segment acknowledges |
| // some new data, i.e., only if it advances the left edge of |
| // the send window. |
| if s.ep.sendTSOk && rcvdSeg.parsedOptions.TSEcr != 0 { |
| // TSVal/Ecr values sent by Netstack are at a millisecond |
| // granularity. |
| elapsed := time.Duration(s.ep.timestamp()-rcvdSeg.parsedOptions.TSEcr) * time.Millisecond |
| s.updateRTO(elapsed) |
| } |
| |
| if s.shouldSchedulePTO() { |
| // Schedule PTO upon receiving an ACK that cumulatively acknowledges data. |
| // See https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.5.1. |
| s.schedulePTO() |
| } else { |
| // When an ack is received we must rearm the timer. |
| // RFC 6298 5.3 |
| s.probeTimer.disable() |
| s.resendTimer.enable(s.rto) |
| } |
| |
| // Remove all acknowledged data from the write list. |
| acked := s.sndUna.Size(ack) |
| s.sndUna = ack |
| |
| // The remote ACK-ing at least 1 byte is an indication that we have a |
| // full-duplex connection to the remote as the only way we will receive an |
| // ACK is if the remote received data that we previously sent. |
| // |
| // As of writing, linux seems to only confirm a route as reachable when |
| // forward progress is made which is indicated by an ACK that removes data |
| // from the retransmit queue. |
| if acked > 0 { |
| s.ep.route.ConfirmReachable() |
| } |
| |
| ackLeft := acked |
| originalOutstanding := s.outstanding |
| for ackLeft > 0 { |
| // We use logicalLen here because we can have FIN |
| // segments (which are always at the end of list) that |
| // have no data, but do consume a sequence number. |
| seg := s.writeList.Front() |
| datalen := seg.logicalLen() |
| |
| if datalen > ackLeft { |
| prevCount := s.pCount(seg, s.maxPayloadSize) |
| seg.data.TrimFront(int(ackLeft)) |
| seg.sequenceNumber.UpdateForward(ackLeft) |
| s.outstanding -= prevCount - s.pCount(seg, s.maxPayloadSize) |
| break |
| } |
| |
| if s.writeNext == seg { |
| s.writeNext = seg.Next() |
| } |
| |
| // Update the RACK fields if SACK is enabled. |
| if s.ep.sackPermitted && !seg.acked && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { |
| s.rc.update(seg, rcvdSeg) |
| s.rc.detectReorder(seg) |
| } |
| |
| s.writeList.Remove(seg) |
| |
| // If SACK is enabled then only reduce outstanding if |
| // the segment was not previously SACKED as these have |
| // already been accounted for in SetPipe(). |
| if !s.ep.sackPermitted || !s.ep.scoreboard.IsSACKED(seg.sackBlock()) { |
| s.outstanding -= s.pCount(seg, s.maxPayloadSize) |
| } else { |
| s.sackedOut -= s.pCount(seg, s.maxPayloadSize) |
| } |
| seg.decRef() |
| ackLeft -= datalen |
| } |
| |
| // Update the send buffer usage and notify potential waiters. |
| s.ep.updateSndBufferUsage(int(acked)) |
| |
| // Clear SACK information for all acked data. |
| s.ep.scoreboard.Delete(s.sndUna) |
| |
| // If we are not in fast recovery then update the congestion |
| // window based on the number of acknowledged packets. |
| if !s.fr.active { |
| s.cc.Update(originalOutstanding - s.outstanding) |
| if s.fr.last.LessThan(s.sndUna) { |
| s.state = tcpip.Open |
| // Update RACK when we are exiting fast or RTO |
| // recovery as described in the RFC |
| // draft-ietf-tcpm-rack-08 Section-7.2 Step 4. |
| if s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { |
| s.rc.exitRecovery() |
| } |
| s.reorderTimer.disable() |
| } |
| } |
| |
| // It is possible for s.outstanding to drop below zero if we get |
| // a retransmit timeout, reset outstanding to zero but later |
| // get an ack that cover previously sent data. |
| if s.outstanding < 0 { |
| s.outstanding = 0 |
| } |
| |
| s.SetPipe() |
| |
| // If all outstanding data was acknowledged the disable the timer. |
| // RFC 6298 Rule 5.3 |
| if s.sndUna == s.sndNxt { |
| s.outstanding = 0 |
| // Reset firstRetransmittedSegXmitTime to the zero value. |
| s.firstRetransmittedSegXmitTime = time.Time{} |
| s.resendTimer.disable() |
| s.probeTimer.disable() |
| } |
| } |
| |
| if s.ep.sackPermitted && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection != 0 { |
| // Update RACK reorder window. |
| // See: https://tools.ietf.org/html/draft-ietf-tcpm-rack-08#section-7.2 |
| // * Upon receiving an ACK: |
| // * Step 4: Update RACK reordering window |
| s.rc.updateRACKReorderWindow(rcvdSeg) |
| |
| // After the reorder window is calculated, detect any loss by checking |
| // if the time elapsed after the segments are sent is greater than the |
| // reorder window. |
| if numLost := s.rc.detectLoss(rcvdSeg.rcvdTime); numLost > 0 && !s.fr.active { |
| // If any segment is marked as lost by |
| // RACK, enter recovery and retransmit |
| // the lost segments. |
| s.cc.HandleLossDetected() |
| s.enterRecovery() |
| fastRetransmit = true |
| } |
| |
| if s.fr.active { |
| s.rc.DoRecovery(nil, fastRetransmit) |
| } |
| } |
| |
| // Now that we've popped all acknowledged data from the retransmit |
| // queue, retransmit if needed. |
| if s.fr.active && s.ep.tcpRecovery&tcpip.TCPRACKLossDetection == 0 { |
| s.lr.DoRecovery(rcvdSeg, fastRetransmit) |
| // When SACK is enabled data sending is governed by steps in |
| // RFC 6675 Section 5 recovery steps A-C. |
| // See: https://tools.ietf.org/html/rfc6675#section-5. |
| if s.ep.sackPermitted { |
| return |
| } |
| } |
| |
| // Send more data now that some of the pending data has been ack'd, or |
| // that the window opened up, or the congestion window was inflated due |
| // to a duplicate ack during fast recovery. This will also re-enable |
| // the retransmit timer if needed. |
| s.sendData() |
| } |
| |
| // sendSegment sends the specified segment. |
| func (s *sender) sendSegment(seg *segment) tcpip.Error { |
| if seg.xmitCount > 0 { |
| s.ep.stack.Stats().TCP.Retransmits.Increment() |
| s.ep.stats.SendErrors.Retransmits.Increment() |
| if s.sndCwnd < s.sndSsthresh { |
| s.ep.stack.Stats().TCP.SlowStartRetransmits.Increment() |
| } |
| } |
| seg.xmitTime = time.Now() |
| seg.xmitCount++ |
| seg.lost = false |
| err := s.sendSegmentFromView(seg.data, seg.flags, seg.sequenceNumber) |
| |
| // Every time a packet containing data is sent (including a |
| // retransmission), if SACK is enabled and we are retransmitting data |
| // then use the conservative timer described in RFC6675 Section 6.0, |
| // otherwise follow the standard time described in RFC6298 Section 5.1. |
| if err != nil && seg.data.Size() != 0 { |
| if s.fr.active && seg.xmitCount > 1 && s.ep.sackPermitted { |
| s.resendTimer.enable(s.rto) |
| } else { |
| if !s.resendTimer.enabled() { |
| s.resendTimer.enable(s.rto) |
| } |
| } |
| } |
| |
| return err |
| } |
| |
| // sendSegmentFromView sends a new segment containing the given payload, flags |
| // and sequence number. |
| func (s *sender) sendSegmentFromView(data buffer.VectorisedView, flags header.TCPFlags, seq seqnum.Value) tcpip.Error { |
| s.lastSendTime = time.Now() |
| if seq == s.rttMeasureSeqNum { |
| s.rttMeasureTime = s.lastSendTime |
| } |
| |
| rcvNxt, rcvWnd := s.ep.rcv.getSendParams() |
| |
| // Remember the max sent ack. |
| s.maxSentAck = rcvNxt |
| |
| return s.ep.sendRaw(data, flags, seq, rcvNxt, rcvWnd) |
| } |
| |
| // maybeSendOutOfWindowAck sends an ACK if we are not being rate limited |
| // currently. |
| func (s *sender) maybeSendOutOfWindowAck(seg *segment) { |
| // Data packets are unlikely to be part of an ACK loop. So always send |
| // an ACK for a packet w/ data. |
| if seg.payloadSize() > 0 || s.ep.allowOutOfWindowAck() { |
| s.sendAck() |
| } |
| } |